pdf_search 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/pdf_search +7 -8
- data/html/index.html.erb +49 -0
- data/lib/elastic_search_query.rb +41 -0
- data/lib/pdf_dir.rb +20 -20
- data/lib/pdf_index.rb +26 -2
- data/lib/pdf_search.rb +5 -4
- data/lib/search_view.rb +25 -0
- data/lib/web_server.rb +36 -16
- metadata +19 -3
- data/html/index.html +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 818cadfa2eedfd47ab0ff7d18b19e6bff5da3578a29c85bffa1e2c9e79f25a7f
|
4
|
+
data.tar.gz: db1416b33897d06fbf2700516fb1668fded85a2b8c849f350708438e1145fd16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2a6705f676e96b751d28f4f42eb6fe9073d90524c0515bc2a49437ec80f2650c6e2afd7346f1bb36c3b02e37c5f7419a4450fc304b3accaf5906ef8bb37b7c5
|
7
|
+
data.tar.gz: 7c0ffc3861dbcdacf690a741926bd61ad46f545bc761b0c1056f74f7f7d9e9203858809b236a15a52d9731682a0d4caefe23f8fceb1c2f5caee69dc91112c8bb
|
data/bin/pdf_search
CHANGED
@@ -2,10 +2,6 @@
|
|
2
2
|
require 'pdf_search'
|
3
3
|
require 'webrick'
|
4
4
|
|
5
|
-
|
6
|
-
PdfSearch.start_webserver
|
7
|
-
|
8
|
-
|
9
5
|
if ARGV[1]
|
10
6
|
require File.expand_path(ARGV[1])
|
11
7
|
|
@@ -13,14 +9,17 @@ if ARGV[1]
|
|
13
9
|
raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
|
14
10
|
end
|
15
11
|
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
search_index_class = eval(ARGV[2])
|
13
|
+
search_index_class.create_index
|
14
|
+
search_index = search_index_class.start_daemon(ARGV[0])
|
19
15
|
else
|
20
16
|
PdfSearch.create_elasticsearch_index
|
21
|
-
PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
17
|
+
search_index = PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
22
18
|
end
|
23
19
|
|
20
|
+
PdfSearch.start_webserver(search_index)
|
21
|
+
|
22
|
+
|
24
23
|
|
25
24
|
|
26
25
|
|
data/html/index.html.erb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
<h1> Keyword (can be blank) </h1>
|
2
|
+
<input id='search' />
|
3
|
+
<% if !search_index.search_input_fields.nil? %>
|
4
|
+
<% search_index.search_input_fields.each do |search_input_field_name, type| %>
|
5
|
+
<% if type == :interval %>
|
6
|
+
<h1><%= search_input_field_name %></h1>
|
7
|
+
|
8
|
+
<h2> Range start </h2>
|
9
|
+
<input id="search_<%= search_input_field_name %>_start" />
|
10
|
+
<h2> Range end </h2>
|
11
|
+
<input id="search_<%= search_input_field_name %>_end" />
|
12
|
+
<% end %>
|
13
|
+
<% end %>
|
14
|
+
<% end %>
|
15
|
+
|
16
|
+
<button id='doSearch'>Search</button>
|
17
|
+
|
18
|
+
<ul id='results'>
|
19
|
+
</ul>
|
20
|
+
|
21
|
+
|
22
|
+
<script>
|
23
|
+
document.getElementById('doSearch').addEventListener('click', function(e) {
|
24
|
+
let Http = new XMLHttpRequest();
|
25
|
+
let url= encodeURI(window.location.origin + "/search");
|
26
|
+
|
27
|
+
let data = {};
|
28
|
+
|
29
|
+
data["search"] = document.getElementById('search').value;
|
30
|
+
|
31
|
+
<% if !search_index.search_input_fields.nil? %>
|
32
|
+
<% search_index.search_input_fields.each do |search_input_field_name, type| %>
|
33
|
+
<% if type == :interval %>
|
34
|
+
data["search_<%= search_input_field_name %>_start"] = document.getElementById("search_<%= search_input_field_name %>_start").value
|
35
|
+
data["search_<%= search_input_field_name %>_end"] = document.getElementById("search_<%= search_input_field_name %>_end").value
|
36
|
+
<% end %>
|
37
|
+
<% end %>
|
38
|
+
<% end %>
|
39
|
+
|
40
|
+
Http.open("POST", url);
|
41
|
+
Http.setRequestHeader('Content-Type', 'application/json; charset=UTF-8');
|
42
|
+
|
43
|
+
Http.send(JSON.stringify(data));
|
44
|
+
|
45
|
+
Http.onreadystatechange=(we)=> {
|
46
|
+
document.getElementById('results').innerHTML = Http.responseText;
|
47
|
+
}
|
48
|
+
});
|
49
|
+
</script>
|
data/lib/elastic_search_query.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'elasticsearch/dsl'
|
2
|
+
module PdfSearch
|
3
|
+
class ElasticSearchQuery
|
4
|
+
include Elasticsearch::DSL
|
5
|
+
attr_reader :query_specification, :search_index
|
6
|
+
|
7
|
+
def initialize(query_specification, search_index)
|
8
|
+
@search_index = search_index
|
9
|
+
@query_specification = query_specification
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_hash
|
13
|
+
{
|
14
|
+
"query": {
|
15
|
+
"bool": {
|
16
|
+
"must": [
|
17
|
+
{
|
18
|
+
"match": {
|
19
|
+
"text": query_specification['search']
|
20
|
+
}
|
21
|
+
}
|
22
|
+
].concat(range_queries)
|
23
|
+
}
|
24
|
+
}
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def range_queries
|
29
|
+
search_index.search_input_fields_by_type[:interval].map do |name|
|
30
|
+
{
|
31
|
+
"range": {
|
32
|
+
name => {
|
33
|
+
gte: query_specification["search_#{name}_start"],
|
34
|
+
lte: query_specification["search_#{name}_end"]
|
35
|
+
}
|
36
|
+
}
|
37
|
+
}
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/pdf_dir.rb
CHANGED
@@ -4,30 +4,30 @@ require 'digest'
|
|
4
4
|
|
5
5
|
module PdfSearch
|
6
6
|
class PdfDir
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
def initialize(dir = '.')
|
8
|
+
@dir = dir
|
9
|
+
end
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
def pdf_file_paths
|
12
|
+
Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
def pdf_documents
|
16
|
+
Enumerator.new do |e|
|
17
|
+
pdf_file_paths.each do |pdf_file_path|
|
18
|
+
e << PDF::Reader.new(pdf_file_path)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
23
|
+
def pages
|
24
|
+
Enumerator.new do |e|
|
25
|
+
pdf_documents.each do |pdf_reader|
|
26
|
+
pdf_reader.pages.each do |page|
|
27
|
+
e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
|
28
|
+
end
|
29
|
+
end
|
28
30
|
end
|
29
31
|
end
|
30
|
-
end
|
31
|
-
end
|
32
32
|
end
|
33
33
|
end
|
data/lib/pdf_index.rb
CHANGED
@@ -3,7 +3,11 @@ require 'elasticsearch'
|
|
3
3
|
require 'pdf_dir'
|
4
4
|
require 'digest'
|
5
5
|
class PdfSearch::PdfIndex
|
6
|
-
|
6
|
+
attr_accessor :daemon
|
7
|
+
|
8
|
+
class << self
|
9
|
+
attr_reader :search_input_fields, :properties
|
10
|
+
end
|
7
11
|
|
8
12
|
def self.create_index
|
9
13
|
::PdfSearch::ElasticSearchClient.indices.create(
|
@@ -31,9 +35,29 @@ class PdfSearch::PdfIndex
|
|
31
35
|
end
|
32
36
|
end
|
33
37
|
|
34
|
-
def self.property(property_name, type)
|
38
|
+
def self.property(property_name, type, options)
|
35
39
|
@properties ||= {}
|
36
40
|
@properties[property_name] = {type: type}
|
41
|
+
|
42
|
+
@search_input_fields ||= {}
|
43
|
+
@search_input_fields_by_type ||= {}
|
44
|
+
|
45
|
+
search_input_type = options.delete(:search)
|
46
|
+
@search_input_fields[property_name] = search_input_type
|
47
|
+
@search_input_fields_by_type[search_input_type] ||= []
|
48
|
+
@search_input_fields_by_type[search_input_type].push(property_name)
|
49
|
+
end
|
50
|
+
|
51
|
+
def search_input_fields_by_type
|
52
|
+
self.class.instance_variable_get(:@search_input_fields_by_type)
|
53
|
+
end
|
54
|
+
|
55
|
+
def search_input_fields
|
56
|
+
self.class.search_input_fields
|
57
|
+
end
|
58
|
+
|
59
|
+
def properties
|
60
|
+
self.class.properties
|
37
61
|
end
|
38
62
|
|
39
63
|
def initialize(pdf_dir)
|
data/lib/pdf_search.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
+
require 'elastic_search_query'
|
1
2
|
require 'pdf_index'
|
2
3
|
require 'daemons'
|
4
|
+
require 'search_view'
|
3
5
|
require 'pdf_dir'
|
4
6
|
require 'elasticsearch'
|
5
7
|
require 'web_server'
|
6
8
|
|
7
9
|
module PdfSearch
|
8
|
-
|
9
|
-
ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
|
10
|
+
ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
|
10
11
|
|
11
12
|
GemDir = [File.dirname(__FILE__), '..']
|
12
13
|
|
@@ -14,8 +15,8 @@ module PdfSearch
|
|
14
15
|
File.expand_path(File.join(GemDir + path_array))
|
15
16
|
end
|
16
17
|
|
17
|
-
def self.start_webserver
|
18
|
-
web_server = WebServer.new
|
18
|
+
def self.start_webserver(search_index)
|
19
|
+
web_server = WebServer.new(search_index)
|
19
20
|
web_server.start
|
20
21
|
return web_server.daemon.pid.pid
|
21
22
|
end
|
data/lib/search_view.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'erb'
|
2
|
+
require 'pry'
|
3
|
+
|
4
|
+
module PdfSearch
|
5
|
+
class SearchView
|
6
|
+
def initialize(search_index)
|
7
|
+
@search_index = search_index
|
8
|
+
end
|
9
|
+
|
10
|
+
def search_view_erb_template
|
11
|
+
File.read(::PdfSearch.relative_to_gem_path(['html', 'index.html.erb']))
|
12
|
+
end
|
13
|
+
|
14
|
+
def render
|
15
|
+
template = ERB.new(search_view_erb_template)
|
16
|
+
template.result(binding)
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def search_index
|
22
|
+
@search_index
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/web_server.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
require 'webrick'
|
2
2
|
require 'json'
|
3
|
+
require 'pry-remote'
|
4
|
+
|
3
5
|
|
4
6
|
class PdfSearch::WebServer
|
5
7
|
attr_accessor :daemon
|
6
8
|
|
9
|
+
def initialize(search_index, debug_mode = false)
|
10
|
+
@search_index = search_index
|
11
|
+
@debug_mode = (ENV['DEBUG_PDF_SEARCH'] == '1') || debug_mode
|
12
|
+
end
|
13
|
+
|
7
14
|
def basic_auth
|
8
15
|
return @basic_auth if @basic_auth != nil
|
9
16
|
|
@@ -15,31 +22,44 @@ class PdfSearch::WebServer
|
|
15
22
|
|
16
23
|
config[:UserDB] = htpasswd
|
17
24
|
|
18
|
-
puts config.inspect
|
19
|
-
|
20
25
|
@basic_auth = WEBrick::HTTPAuth::BasicAuth.new config
|
21
|
-
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def index(request, response)
|
29
|
+
search_view = ::PdfSearch::SearchView.new(@search_index)
|
30
|
+
response.status = 200
|
31
|
+
response['Content-Type'] = 'text/html'
|
32
|
+
response.body = search_view.render
|
33
|
+
end
|
34
|
+
|
35
|
+
def auth_defined?
|
36
|
+
[ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
|
37
|
+
env_var != nil && env_var != ''
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def search(request, response)
|
42
|
+
if auth_defined?
|
43
|
+
basic_auth.authenticate(request, response)
|
44
|
+
end
|
45
|
+
|
46
|
+
query = ::PdfSearch::ElasticSearchQuery.new(JSON.parse(request.body), @search_index)
|
47
|
+
elastic_response = ::PdfSearch::ElasticSearchClient.search index: 'pdf_pages', body: query.to_hash
|
48
|
+
|
49
|
+
response.body = response_html(elastic_response)
|
50
|
+
end
|
22
51
|
|
23
52
|
def start
|
24
53
|
start_server = lambda do
|
25
|
-
server = WEBrick::HTTPServer.new(:Port => 80
|
26
|
-
|
27
|
-
server.mount_proc '/search' do |request, response|
|
28
|
-
if [ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
|
29
|
-
env_var != nil && env_var != ''
|
30
|
-
end
|
54
|
+
server = WEBrick::HTTPServer.new(:Port => 80)
|
31
55
|
|
32
|
-
|
33
|
-
|
34
|
-
query = request.query["query"]
|
35
|
-
elastic_response = ::PdfSearch::ElasticSearchClient.search q: query, size: 200
|
36
|
-
response.body = response_html(elastic_response)
|
37
|
-
end
|
56
|
+
server.mount_proc '/', &method(:index)
|
57
|
+
server.mount_proc '/search', &method(:search)
|
38
58
|
|
39
59
|
server.start
|
40
60
|
end
|
41
61
|
|
42
|
-
if
|
62
|
+
if @debug_mode
|
43
63
|
start_server.call
|
44
64
|
else
|
45
65
|
self.daemon = Daemons.call(multiple: true, &start_server)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Arno Korfmann
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: elasticsearch-dsl
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -118,10 +132,12 @@ extra_rdoc_files: []
|
|
118
132
|
files:
|
119
133
|
- bin/pdf_search
|
120
134
|
- config/elasticsearch.yml
|
121
|
-
- html/index.html
|
135
|
+
- html/index.html.erb
|
136
|
+
- lib/elastic_search_query.rb
|
122
137
|
- lib/pdf_dir.rb
|
123
138
|
- lib/pdf_index.rb
|
124
139
|
- lib/pdf_search.rb
|
140
|
+
- lib/search_view.rb
|
125
141
|
- lib/web_server.rb
|
126
142
|
- log/pdf_index.log
|
127
143
|
homepage: https://github.com/banalBI/pdfsearch
|
@@ -144,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
144
160
|
version: '0'
|
145
161
|
requirements: []
|
146
162
|
rubyforge_project:
|
147
|
-
rubygems_version:
|
163
|
+
rubygems_version: 3.0.0.beta3
|
148
164
|
signing_key:
|
149
165
|
specification_version: 4
|
150
166
|
summary: Searching pdfs by leveragin Elasticsearch
|
data/html/index.html
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
<input id='search'></input>
|
2
|
-
|
3
|
-
<button id='doSearch'>Search</button>
|
4
|
-
|
5
|
-
<ul id='results'>
|
6
|
-
</ul>
|
7
|
-
|
8
|
-
|
9
|
-
<script>
|
10
|
-
document.getElementById('doSearch').addEventListener('click', function(e) {
|
11
|
-
let Http = new XMLHttpRequest();
|
12
|
-
let url= encodeURI(window.location.origin + "/search?query="+document.getElementById('search').value);
|
13
|
-
Http.open("GET", url);
|
14
|
-
Http.send();
|
15
|
-
|
16
|
-
Http.onreadystatechange=(e)=> {
|
17
|
-
document.getElementById('results').innerHTML = Http.responseText;
|
18
|
-
}
|
19
|
-
});
|
20
|
-
</script>
|