pdf_search 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 190956cae75ce3b900499a7d1dac12ebe3807078a85610266b25bfbfb4b6f1a1
4
- data.tar.gz: c66b5e9ebbc690b8477e9ddd86cf81252b3c4772afebfc47e1bc1d7c8c4e6c2e
3
+ metadata.gz: 818cadfa2eedfd47ab0ff7d18b19e6bff5da3578a29c85bffa1e2c9e79f25a7f
4
+ data.tar.gz: db1416b33897d06fbf2700516fb1668fded85a2b8c849f350708438e1145fd16
5
5
  SHA512:
6
- metadata.gz: d5508109f2014f2343700ab66c13482d3d5417bc51cbbd3bd1e3a3cd3a6ef108e13722430e9de6078e939ed093cc34048dbf19f5894d6d225fd54c7836adc6cf
7
- data.tar.gz: 1b9c8910e4d95144f8c5f28057697883f2d5da8a8099ebe714289e767841c8ec2579f7d8406126db2c5372049a3ffbdf5367afcba85a96b05e5c5c7bd9b396f6
6
+ metadata.gz: d2a6705f676e96b751d28f4f42eb6fe9073d90524c0515bc2a49437ec80f2650c6e2afd7346f1bb36c3b02e37c5f7419a4450fc304b3accaf5906ef8bb37b7c5
7
+ data.tar.gz: 7c0ffc3861dbcdacf690a741926bd61ad46f545bc761b0c1056f74f7f7d9e9203858809b236a15a52d9731682a0d4caefe23f8fceb1c2f5caee69dc91112c8bb
@@ -2,10 +2,6 @@
2
2
  require 'pdf_search'
3
3
  require 'webrick'
4
4
 
5
-
6
- PdfSearch.start_webserver
7
-
8
-
9
5
  if ARGV[1]
10
6
  require File.expand_path(ARGV[1])
11
7
 
@@ -13,14 +9,17 @@ if ARGV[1]
13
9
  raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
14
10
  end
15
11
 
16
- custom_index = eval(ARGV[2])
17
- custom_index.create_index
18
- custom_index.start_daemon(ARGV[0])
12
+ search_index_class = eval(ARGV[2])
13
+ search_index_class.create_index
14
+ search_index = search_index_class.start_daemon(ARGV[0])
19
15
  else
20
16
  PdfSearch.create_elasticsearch_index
21
- PdfSearch::PdfIndex.start_daemon(ARGV[0])
17
+ search_index = PdfSearch::PdfIndex.start_daemon(ARGV[0])
22
18
  end
23
19
 
20
+ PdfSearch.start_webserver(search_index)
21
+
22
+
24
23
 
25
24
 
26
25
 
@@ -0,0 +1,49 @@
1
<%# Search page template. Rendered with a binding that exposes `search_index`;
    every entry of `search_index.search_input_fields` whose type is :interval
    gets a pair of range inputs named search_<field>_start / search_<field>_end. %>
<% interval_field_names = (search_index.search_input_fields || {}).select { |_name, type| type == :interval }.keys %>
<h1> Keyword (can be blank) </h1>
<input id='search' />
<% interval_field_names.each do |field_name| %>
  <%# Escape the field name: it ends up inside markup and an attribute value. %>
  <% escaped_name = ERB::Util.html_escape(field_name) %>
  <h1><%= escaped_name %></h1>

  <h2> Range start </h2>
  <input id="search_<%= escaped_name %>_start" data-search-field />
  <h2> Range end </h2>
  <input id="search_<%= escaped_name %>_end" data-search-field />
<% end %>

<button id='doSearch'>Search</button>

<ul id='results'>
</ul>


<script>
  document.getElementById('doSearch').addEventListener('click', function () {
    let request = new XMLHttpRequest();
    let url = encodeURI(window.location.origin + "/search");

    let data = {};
    data["search"] = document.getElementById('search').value;

    // Each range input is tagged with `data-search-field`; its id is exactly the
    // JSON key the server reads (search_<field>_start / search_<field>_end), so
    // no per-field JavaScript has to be generated by the template.
    document.querySelectorAll('input[data-search-field]').forEach(function (input) {
      data[input.id] = input.value;
    });

    // Register the handler before sending and only touch the DOM once the
    // response is complete, instead of on every intermediate state change.
    request.onreadystatechange = function () {
      if (request.readyState === XMLHttpRequest.DONE) {
        document.getElementById('results').innerHTML = request.responseText;
      }
    };

    request.open("POST", url);
    request.setRequestHeader('Content-Type', 'application/json; charset=UTF-8');
    request.send(JSON.stringify(data));
  });
</script>
@@ -0,0 +1,41 @@
1
+ require 'elasticsearch/dsl'
2
module PdfSearch
  # Translates a search form submission into an Elasticsearch request body.
  #
  # `query_specification` is a Hash with string keys: 'search' holds the
  # keyword, and "search_<name>_start" / "search_<name>_end" hold the bounds
  # for every field the search index registered under the :interval type.
  class ElasticSearchQuery
    include Elasticsearch::DSL
    attr_reader :query_specification, :search_index

    # @param query_specification [Hash] submitted values, keyed by string
    # @param search_index [#search_input_fields_by_type] index describing the
    #   searchable fields
    def initialize(query_specification, search_index)
      @search_index = search_index
      @query_specification = query_specification
    end

    # @return [Hash] request body with a single `bool` query whose `must`
    #   list holds the keyword clause (if any) followed by the range clauses.
    #   With no clauses at all the `must` list is empty.
    def to_hash
      { query: { bool: { must: keyword_queries + range_queries } } }
    end

    # @return [Array<Hash>] one `range` clause per interval field for which
    #   at least one bound was supplied
    def range_queries
      interval_field_names.map { |name| range_query(name) }.compact
    end

    private

    # A blank keyword produces no clause: per the Elasticsearch documentation
    # a `match` query whose text yields no terms returns no documents by
    # default, which would make a blank keyword hide every result.
    def keyword_queries
      keyword = query_specification['search']
      return [] if blank?(keyword)

      [{ match: { text: keyword } }]
    end

    # Names registered under :interval. The index reports nil when nothing was
    # registered at all, and has no :interval key when no interval field exists.
    def interval_field_names
      fields_by_type = search_index.search_input_fields_by_type || {}
      fields_by_type[:interval] || []
    end

    # Builds the range clause for one field, leaving out bounds the user did
    # not fill in. Returns nil when neither bound is present.
    # NOTE(review): an empty-string bound is presumably rejected by
    # Elasticsearch for date/numeric fields — confirm against the mapping.
    def range_query(name)
      bounds = {
        gte: query_specification["search_#{name}_start"],
        lte: query_specification["search_#{name}_end"]
      }.reject { |_operator, value| blank?(value) }
      return if bounds.empty?

      { range: { name => bounds } }
    end

    def blank?(value)
      value.nil? || value.to_s.strip.empty?
    end
  end
end
@@ -4,30 +4,30 @@ require 'digest'
4
4
 
5
5
  module PdfSearch
6
6
  class PdfDir
7
- def initialize(dir = '.')
8
- @dir = dir
9
- end
7
+ def initialize(dir = '.')
8
+ @dir = dir
9
+ end
10
10
 
11
- def pdf_file_paths
12
- Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
13
- end
11
+ def pdf_file_paths
12
+ Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
13
+ end
14
14
 
15
- def pdf_documents
16
- Enumerator.new do |e|
17
- pdf_file_paths.each do |pdf_file_path|
18
- e << PDF::Reader.new(pdf_file_path)
19
- end
20
- end
21
- end
15
+ def pdf_documents
16
+ Enumerator.new do |e|
17
+ pdf_file_paths.each do |pdf_file_path|
18
+ e << PDF::Reader.new(pdf_file_path)
19
+ end
20
+ end
21
+ end
22
22
 
23
- def pages
24
- Enumerator.new do |e|
25
- pdf_documents.each do |pdf_reader|
26
- pdf_reader.pages.each do |page|
27
- e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
23
+ def pages
24
+ Enumerator.new do |e|
25
+ pdf_documents.each do |pdf_reader|
26
+ pdf_reader.pages.each do |page|
27
+ e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
28
+ end
29
+ end
28
30
  end
29
31
  end
30
- end
31
- end
32
32
  end
33
33
  end
@@ -3,7 +3,11 @@ require 'elasticsearch'
3
3
  require 'pdf_dir'
4
4
  require 'digest'
5
5
  class PdfSearch::PdfIndex
6
- attr_accessor :daemon
6
+ attr_accessor :daemon
7
+
8
+ class << self
9
+ attr_reader :search_input_fields, :properties
10
+ end
7
11
 
8
12
  def self.create_index
9
13
  ::PdfSearch::ElasticSearchClient.indices.create(
@@ -31,9 +35,29 @@ class PdfSearch::PdfIndex
31
35
  end
32
36
  end
33
37
 
34
- def self.property(property_name, type)
38
# Declares an indexed property and, optionally, how it can be searched.
#
# @param property_name [Symbol, String] name of the property
# @param type [Object] stored as `{type: type}` under the property name
#   (presumably the Elasticsearch mapping type — confirm against create_index)
# @param options [Hash] optional settings; `:search` names the kind of search
#   input to offer for this property (the search page renders `:interval`)
#
# `options` defaults to an empty Hash so existing two-argument declarations
# keep working, and it is only read, never modified. Properties declared
# without `:search` are not added to the search-input registries.
def self.property(property_name, type, options = {})
  @properties ||= {}
  @properties[property_name] = { type: type }

  @search_input_fields ||= {}
  @search_input_fields_by_type ||= {}

  search_input_type = options[:search]
  return if search_input_type.nil?

  @search_input_fields[property_name] = search_input_type
  (@search_input_fields_by_type[search_input_type] ||= []) << property_name
end
50
+
51
+ def search_input_fields_by_type
52
+ self.class.instance_variable_get(:@search_input_fields_by_type)
53
+ end
54
+
55
+ def search_input_fields
56
+ self.class.search_input_fields
57
+ end
58
+
59
+ def properties
60
+ self.class.properties
37
61
  end
38
62
 
39
63
  def initialize(pdf_dir)
@@ -1,12 +1,13 @@
1
+ require 'elastic_search_query'
1
2
  require 'pdf_index'
2
3
  require 'daemons'
4
+ require 'search_view'
3
5
  require 'pdf_dir'
4
6
  require 'elasticsearch'
5
7
  require 'web_server'
6
8
 
7
9
  module PdfSearch
8
-
9
- ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
10
+ ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
10
11
 
11
12
  GemDir = [File.dirname(__FILE__), '..']
12
13
 
@@ -14,8 +15,8 @@ module PdfSearch
14
15
  File.expand_path(File.join(GemDir + path_array))
15
16
  end
16
17
 
17
- def self.start_webserver
18
- web_server = WebServer.new
18
+ def self.start_webserver(search_index)
19
+ web_server = WebServer.new(search_index)
19
20
  web_server.start
20
21
  return web_server.daemon.pid.pid
21
22
  end
@@ -0,0 +1,25 @@
1
+ require 'erb'
2
+ require 'pry'
3
+
4
module PdfSearch
  # Renders the search page from the gem's html/index.html.erb template.
  # The template is evaluated in the context of this object, so it can call
  # the private `search_index` reader.
  class SearchView
    # @param search_index [Object] index whose search fields the template lists
    def initialize(search_index)
      @search_index = search_index
    end

    # @return [String] raw ERB source of the search page, read from disk on
    #   every call
    def search_view_erb_template
      File.read(::PdfSearch.relative_to_gem_path(['html', 'index.html.erb']))
    end

    # @return [String] the rendered HTML page
    def render
      ERB.new(search_view_erb_template).result(binding)
    end

    private

    attr_reader :search_index
  end
end
@@ -1,9 +1,16 @@
1
1
  require 'webrick'
2
2
  require 'json'
3
+ require 'pry-remote'
4
+
3
5
 
4
6
  class PdfSearch::WebServer
5
7
  attr_accessor :daemon
6
8
 
9
+ def initialize(search_index, debug_mode = false)
10
+ @search_index = search_index
11
+ @debug_mode = (ENV['DEBUG_PDF_SEARCH'] == '1') || debug_mode
12
+ end
13
+
7
14
  def basic_auth
8
15
  return @basic_auth if @basic_auth != nil
9
16
 
@@ -15,31 +22,44 @@ class PdfSearch::WebServer
15
22
 
16
23
  config[:UserDB] = htpasswd
17
24
 
18
- puts config.inspect
19
-
20
25
  @basic_auth = WEBrick::HTTPAuth::BasicAuth.new config
21
- end
26
+ end
27
+
28
# Responds to a request with the rendered search page for this server's
# search index: status 200, content type text/html.
#
# @param request [Object] incoming request (not inspected)
# @param response [Object] response to fill in
def index(request, response)
  page = ::PdfSearch::SearchView.new(@search_index)

  response.status = 200
  response['Content-Type'] = 'text/html'
  response.body = page.render
end
34
+
35
# Whether credentials are configured: true only when both the
# PDF_SEARCH_USERNAME and PDF_SEARCH_PASSWORD environment variables are set
# to non-empty values.
#
# @return [Boolean]
def auth_defined?
  ENV.values_at('PDF_SEARCH_USERNAME', 'PDF_SEARCH_PASSWORD').none? do |value|
    value.nil? || value.empty?
  end
end
40
+
41
# Runs a search described by the JSON request body against the 'pdf_pages'
# index and writes the rendered results into the response body.
#
# When credentials are configured (see auth_defined?), the request is passed
# to basic_auth.authenticate before anything else happens.
#
# A missing body, malformed JSON, or JSON that is not an object is answered
# with status 400 and a short message instead of raising.
#
# @param request [Object] incoming request; its `body` is the JSON payload
# @param response [Object] response to fill in
def search(request, response)
  basic_auth.authenticate(request, response) if auth_defined?

  query_specification =
    begin
      # `to_s` turns an absent (nil) body into "", which fails parsing below.
      JSON.parse(request.body.to_s)
    rescue JSON::ParserError
      nil
    end

  unless query_specification.is_a?(Hash)
    response.status = 400
    response.body = 'Request body must be a JSON object'
    return
  end

  query = ::PdfSearch::ElasticSearchQuery.new(query_specification, @search_index)
  elastic_response = ::PdfSearch::ElasticSearchClient.search index: 'pdf_pages', body: query.to_hash

  response.body = response_html(elastic_response)
end
22
51
 
23
52
  def start
24
53
  start_server = lambda do
25
- server = WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => ::PdfSearch.relative_to_gem_path(['html']))
26
-
27
- server.mount_proc '/search' do |request, response|
28
- if [ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
29
- env_var != nil && env_var != ''
30
- end
54
+ server = WEBrick::HTTPServer.new(:Port => 80)
31
55
 
32
- basic_auth.authenticate(request, response)
33
- end
34
- query = request.query["query"]
35
- elastic_response = ::PdfSearch::ElasticSearchClient.search q: query, size: 200
36
- response.body = response_html(elastic_response)
37
- end
56
+ server.mount_proc '/', &method(:index)
57
+ server.mount_proc '/search', &method(:search)
38
58
 
39
59
  server.start
40
60
  end
41
61
 
42
- if ENV['DEBUG_PDF_SEARCH'] == '1'
62
+ if @debug_mode
43
63
  start_server.call
44
64
  else
45
65
  self.daemon = Daemons.call(multiple: true, &start_server)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Arno Korfmann
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: elasticsearch-dsl
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: rake
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -118,10 +132,12 @@ extra_rdoc_files: []
118
132
  files:
119
133
  - bin/pdf_search
120
134
  - config/elasticsearch.yml
121
- - html/index.html
135
+ - html/index.html.erb
136
+ - lib/elastic_search_query.rb
122
137
  - lib/pdf_dir.rb
123
138
  - lib/pdf_index.rb
124
139
  - lib/pdf_search.rb
140
+ - lib/search_view.rb
125
141
  - lib/web_server.rb
126
142
  - log/pdf_index.log
127
143
  homepage: https://github.com/banalBI/pdfsearch
@@ -144,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
144
160
  version: '0'
145
161
  requirements: []
146
162
  rubyforge_project:
147
- rubygems_version: 2.7.6
163
+ rubygems_version: 3.0.0.beta3
148
164
  signing_key:
149
165
  specification_version: 4
150
166
  summary: Searching pdfs by leveragin Elasticsearch
@@ -1,20 +0,0 @@
1
- <input id='search'></input>
2
-
3
- <button id='doSearch'>Search</button>
4
-
5
- <ul id='results'>
6
- </ul>
7
-
8
-
9
- <script>
10
- document.getElementById('doSearch').addEventListener('click', function(e) {
11
- let Http = new XMLHttpRequest();
12
- let url= encodeURI(window.location.origin + "/search?query="+document.getElementById('search').value);
13
- Http.open("GET", url);
14
- Http.send();
15
-
16
- Http.onreadystatechange=(e)=> {
17
- document.getElementById('results').innerHTML = Http.responseText;
18
- }
19
- });
20
- </script>