pdf_search 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/pdf_search +7 -8
- data/html/index.html.erb +49 -0
- data/lib/elastic_search_query.rb +41 -0
- data/lib/pdf_dir.rb +20 -20
- data/lib/pdf_index.rb +26 -2
- data/lib/pdf_search.rb +5 -4
- data/lib/search_view.rb +25 -0
- data/lib/web_server.rb +36 -16
- metadata +19 -3
- data/html/index.html +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 818cadfa2eedfd47ab0ff7d18b19e6bff5da3578a29c85bffa1e2c9e79f25a7f
|
4
|
+
data.tar.gz: db1416b33897d06fbf2700516fb1668fded85a2b8c849f350708438e1145fd16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2a6705f676e96b751d28f4f42eb6fe9073d90524c0515bc2a49437ec80f2650c6e2afd7346f1bb36c3b02e37c5f7419a4450fc304b3accaf5906ef8bb37b7c5
|
7
|
+
data.tar.gz: 7c0ffc3861dbcdacf690a741926bd61ad46f545bc761b0c1056f74f7f7d9e9203858809b236a15a52d9731682a0d4caefe23f8fceb1c2f5caee69dc91112c8bb
|
data/bin/pdf_search
CHANGED
@@ -2,10 +2,6 @@
|
|
2
2
|
require 'pdf_search'
|
3
3
|
require 'webrick'
|
4
4
|
|
5
|
-
|
6
|
-
PdfSearch.start_webserver
|
7
|
-
|
8
|
-
|
9
5
|
if ARGV[1]
|
10
6
|
require File.expand_path(ARGV[1])
|
11
7
|
|
@@ -13,14 +9,17 @@ if ARGV[1]
|
|
13
9
|
raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
|
14
10
|
end
|
15
11
|
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
search_index_class = eval(ARGV[2])
|
13
|
+
search_index_class.create_index
|
14
|
+
search_index = search_index_class.start_daemon(ARGV[0])
|
19
15
|
else
|
20
16
|
PdfSearch.create_elasticsearch_index
|
21
|
-
PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
17
|
+
search_index = PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
22
18
|
end
|
23
19
|
|
20
|
+
PdfSearch.start_webserver(search_index)
|
21
|
+
|
22
|
+
|
24
23
|
|
25
24
|
|
26
25
|
|
data/html/index.html.erb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
<h1> Keyword (can be blank) </h1>
|
2
|
+
<input id='search' />
|
3
|
+
<% if !search_index.search_input_fields.nil? %>
|
4
|
+
<% search_index.search_input_fields.each do |search_input_field_name, type| %>
|
5
|
+
<% if type == :interval %>
|
6
|
+
<h1><%= search_input_field_name %></h1>
|
7
|
+
|
8
|
+
<h2> Range start </h2>
|
9
|
+
<input id="search_<%= search_input_field_name %>_start" />
|
10
|
+
<h2> Range end </h2>
|
11
|
+
<input id="search_<%= search_input_field_name %>_end" />
|
12
|
+
<% end %>
|
13
|
+
<% end %>
|
14
|
+
<% end %>
|
15
|
+
|
16
|
+
<button id='doSearch'>Search</button>
|
17
|
+
|
18
|
+
<ul id='results'>
|
19
|
+
</ul>
|
20
|
+
|
21
|
+
|
22
|
+
<script>
|
23
|
+
document.getElementById('doSearch').addEventListener('click', function(e) {
|
24
|
+
let Http = new XMLHttpRequest();
|
25
|
+
let url= encodeURI(window.location.origin + "/search");
|
26
|
+
|
27
|
+
let data = {};
|
28
|
+
|
29
|
+
data["search"] = document.getElementById('search').value;
|
30
|
+
|
31
|
+
<% if !search_index.search_input_fields.nil? %>
|
32
|
+
<% search_index.search_input_fields.each do |search_input_field_name, type| %>
|
33
|
+
<% if type == :interval %>
|
34
|
+
data["search_<%= search_input_field_name %>_start"] = document.getElementById("search_<%= search_input_field_name %>_start").value
|
35
|
+
data["search_<%= search_input_field_name %>_end"] = document.getElementById("search_<%= search_input_field_name %>_end").value
|
36
|
+
<% end %>
|
37
|
+
<% end %>
|
38
|
+
<% end %>
|
39
|
+
|
40
|
+
Http.open("POST", url);
|
41
|
+
Http.setRequestHeader('Content-Type', 'application/json; charset=UTF-8');
|
42
|
+
|
43
|
+
Http.send(JSON.stringify(data));
|
44
|
+
|
45
|
+
Http.onreadystatechange=(we)=> {
|
46
|
+
document.getElementById('results').innerHTML = Http.responseText;
|
47
|
+
}
|
48
|
+
});
|
49
|
+
</script>
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'elasticsearch/dsl'
|
2
|
+
module PdfSearch
|
3
|
+
class ElasticSearchQuery
|
4
|
+
include Elasticsearch::DSL
|
5
|
+
attr_reader :query_specification, :search_index
|
6
|
+
|
7
|
+
def initialize(query_specification, search_index)
|
8
|
+
@search_index = search_index
|
9
|
+
@query_specification = query_specification
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_hash
|
13
|
+
{
|
14
|
+
"query": {
|
15
|
+
"bool": {
|
16
|
+
"must": [
|
17
|
+
{
|
18
|
+
"match": {
|
19
|
+
"text": query_specification['search']
|
20
|
+
}
|
21
|
+
}
|
22
|
+
].concat(range_queries)
|
23
|
+
}
|
24
|
+
}
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def range_queries
|
29
|
+
search_index.search_input_fields_by_type[:interval].map do |name|
|
30
|
+
{
|
31
|
+
"range": {
|
32
|
+
name => {
|
33
|
+
gte: query_specification["search_#{name}_start"],
|
34
|
+
lte: query_specification["search_#{name}_end"]
|
35
|
+
}
|
36
|
+
}
|
37
|
+
}
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/pdf_dir.rb
CHANGED
@@ -4,30 +4,30 @@ require 'digest'
|
|
4
4
|
|
5
5
|
module PdfSearch
|
6
6
|
class PdfDir
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
def initialize(dir = '.')
|
8
|
+
@dir = dir
|
9
|
+
end
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
def pdf_file_paths
|
12
|
+
Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
def pdf_documents
|
16
|
+
Enumerator.new do |e|
|
17
|
+
pdf_file_paths.each do |pdf_file_path|
|
18
|
+
e << PDF::Reader.new(pdf_file_path)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
23
|
+
def pages
|
24
|
+
Enumerator.new do |e|
|
25
|
+
pdf_documents.each do |pdf_reader|
|
26
|
+
pdf_reader.pages.each do |page|
|
27
|
+
e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
|
28
|
+
end
|
29
|
+
end
|
28
30
|
end
|
29
31
|
end
|
30
|
-
end
|
31
|
-
end
|
32
32
|
end
|
33
33
|
end
|
data/lib/pdf_index.rb
CHANGED
@@ -3,7 +3,11 @@ require 'elasticsearch'
|
|
3
3
|
require 'pdf_dir'
|
4
4
|
require 'digest'
|
5
5
|
class PdfSearch::PdfIndex
|
6
|
-
|
6
|
+
attr_accessor :daemon
|
7
|
+
|
8
|
+
class << self
|
9
|
+
attr_reader :search_input_fields, :properties
|
10
|
+
end
|
7
11
|
|
8
12
|
def self.create_index
|
9
13
|
::PdfSearch::ElasticSearchClient.indices.create(
|
@@ -31,9 +35,29 @@ class PdfSearch::PdfIndex
|
|
31
35
|
end
|
32
36
|
end
|
33
37
|
|
34
|
-
def self.property(property_name, type)
|
38
|
+
def self.property(property_name, type, options)
|
35
39
|
@properties ||= {}
|
36
40
|
@properties[property_name] = {type: type}
|
41
|
+
|
42
|
+
@search_input_fields ||= {}
|
43
|
+
@search_input_fields_by_type ||= {}
|
44
|
+
|
45
|
+
search_input_type = options.delete(:search)
|
46
|
+
@search_input_fields[property_name] = search_input_type
|
47
|
+
@search_input_fields_by_type[search_input_type] ||= []
|
48
|
+
@search_input_fields_by_type[search_input_type].push(property_name)
|
49
|
+
end
|
50
|
+
|
51
|
+
def search_input_fields_by_type
|
52
|
+
self.class.instance_variable_get(:@search_input_fields_by_type)
|
53
|
+
end
|
54
|
+
|
55
|
+
def search_input_fields
|
56
|
+
self.class.search_input_fields
|
57
|
+
end
|
58
|
+
|
59
|
+
def properties
|
60
|
+
self.class.properties
|
37
61
|
end
|
38
62
|
|
39
63
|
def initialize(pdf_dir)
|
data/lib/pdf_search.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
+
require 'elastic_search_query'
|
1
2
|
require 'pdf_index'
|
2
3
|
require 'daemons'
|
4
|
+
require 'search_view'
|
3
5
|
require 'pdf_dir'
|
4
6
|
require 'elasticsearch'
|
5
7
|
require 'web_server'
|
6
8
|
|
7
9
|
module PdfSearch
|
8
|
-
|
9
|
-
ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
|
10
|
+
ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
|
10
11
|
|
11
12
|
GemDir = [File.dirname(__FILE__), '..']
|
12
13
|
|
@@ -14,8 +15,8 @@ module PdfSearch
|
|
14
15
|
File.expand_path(File.join(GemDir + path_array))
|
15
16
|
end
|
16
17
|
|
17
|
-
def self.start_webserver
|
18
|
-
web_server = WebServer.new
|
18
|
+
def self.start_webserver(search_index)
|
19
|
+
web_server = WebServer.new(search_index)
|
19
20
|
web_server.start
|
20
21
|
return web_server.daemon.pid.pid
|
21
22
|
end
|
data/lib/search_view.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'erb'
|
2
|
+
require 'pry'
|
3
|
+
|
4
|
+
module PdfSearch
|
5
|
+
class SearchView
|
6
|
+
def initialize(search_index)
|
7
|
+
@search_index = search_index
|
8
|
+
end
|
9
|
+
|
10
|
+
def search_view_erb_template
|
11
|
+
File.read(::PdfSearch.relative_to_gem_path(['html', 'index.html.erb']))
|
12
|
+
end
|
13
|
+
|
14
|
+
def render
|
15
|
+
template = ERB.new(search_view_erb_template)
|
16
|
+
template.result(binding)
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def search_index
|
22
|
+
@search_index
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/web_server.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
require 'webrick'
|
2
2
|
require 'json'
|
3
|
+
require 'pry-remote'
|
4
|
+
|
3
5
|
|
4
6
|
class PdfSearch::WebServer
|
5
7
|
attr_accessor :daemon
|
6
8
|
|
9
|
+
def initialize(search_index, debug_mode = false)
|
10
|
+
@search_index = search_index
|
11
|
+
@debug_mode = (ENV['DEBUG_PDF_SEARCH'] == '1') || debug_mode
|
12
|
+
end
|
13
|
+
|
7
14
|
def basic_auth
|
8
15
|
return @basic_auth if @basic_auth != nil
|
9
16
|
|
@@ -15,31 +22,44 @@ class PdfSearch::WebServer
|
|
15
22
|
|
16
23
|
config[:UserDB] = htpasswd
|
17
24
|
|
18
|
-
puts config.inspect
|
19
|
-
|
20
25
|
@basic_auth = WEBrick::HTTPAuth::BasicAuth.new config
|
21
|
-
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def index(request, response)
|
29
|
+
search_view = ::PdfSearch::SearchView.new(@search_index)
|
30
|
+
response.status = 200
|
31
|
+
response['Content-Type'] = 'text/html'
|
32
|
+
response.body = search_view.render
|
33
|
+
end
|
34
|
+
|
35
|
+
def auth_defined?
|
36
|
+
[ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
|
37
|
+
env_var != nil && env_var != ''
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def search(request, response)
|
42
|
+
if auth_defined?
|
43
|
+
basic_auth.authenticate(request, response)
|
44
|
+
end
|
45
|
+
|
46
|
+
query = ::PdfSearch::ElasticSearchQuery.new(JSON.parse(request.body), @search_index)
|
47
|
+
elastic_response = ::PdfSearch::ElasticSearchClient.search index: 'pdf_pages', body: query.to_hash
|
48
|
+
|
49
|
+
response.body = response_html(elastic_response)
|
50
|
+
end
|
22
51
|
|
23
52
|
def start
|
24
53
|
start_server = lambda do
|
25
|
-
server = WEBrick::HTTPServer.new(:Port => 80
|
26
|
-
|
27
|
-
server.mount_proc '/search' do |request, response|
|
28
|
-
if [ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
|
29
|
-
env_var != nil && env_var != ''
|
30
|
-
end
|
54
|
+
server = WEBrick::HTTPServer.new(:Port => 80)
|
31
55
|
|
32
|
-
|
33
|
-
|
34
|
-
query = request.query["query"]
|
35
|
-
elastic_response = ::PdfSearch::ElasticSearchClient.search q: query, size: 200
|
36
|
-
response.body = response_html(elastic_response)
|
37
|
-
end
|
56
|
+
server.mount_proc '/', &method(:index)
|
57
|
+
server.mount_proc '/search', &method(:search)
|
38
58
|
|
39
59
|
server.start
|
40
60
|
end
|
41
61
|
|
42
|
-
if
|
62
|
+
if @debug_mode
|
43
63
|
start_server.call
|
44
64
|
else
|
45
65
|
self.daemon = Daemons.call(multiple: true, &start_server)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Arno Korfmann
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: elasticsearch-dsl
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -118,10 +132,12 @@ extra_rdoc_files: []
|
|
118
132
|
files:
|
119
133
|
- bin/pdf_search
|
120
134
|
- config/elasticsearch.yml
|
121
|
-
- html/index.html
|
135
|
+
- html/index.html.erb
|
136
|
+
- lib/elastic_search_query.rb
|
122
137
|
- lib/pdf_dir.rb
|
123
138
|
- lib/pdf_index.rb
|
124
139
|
- lib/pdf_search.rb
|
140
|
+
- lib/search_view.rb
|
125
141
|
- lib/web_server.rb
|
126
142
|
- log/pdf_index.log
|
127
143
|
homepage: https://github.com/banalBI/pdfsearch
|
@@ -144,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
144
160
|
version: '0'
|
145
161
|
requirements: []
|
146
162
|
rubyforge_project:
|
147
|
-
rubygems_version:
|
163
|
+
rubygems_version: 3.0.0.beta3
|
148
164
|
signing_key:
|
149
165
|
specification_version: 4
|
150
166
|
summary: Searching pdfs by leveragin Elasticsearch
|
data/html/index.html
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
<input id='search'></input>
|
2
|
-
|
3
|
-
<button id='doSearch'>Search</button>
|
4
|
-
|
5
|
-
<ul id='results'>
|
6
|
-
</ul>
|
7
|
-
|
8
|
-
|
9
|
-
<script>
|
10
|
-
document.getElementById('doSearch').addEventListener('click', function(e) {
|
11
|
-
let Http = new XMLHttpRequest();
|
12
|
-
let url= encodeURI(window.location.origin + "/search?query="+document.getElementById('search').value);
|
13
|
-
Http.open("GET", url);
|
14
|
-
Http.send();
|
15
|
-
|
16
|
-
Http.onreadystatechange=(e)=> {
|
17
|
-
document.getElementById('results').innerHTML = Http.responseText;
|
18
|
-
}
|
19
|
-
});
|
20
|
-
</script>
|