pdf_search 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 190956cae75ce3b900499a7d1dac12ebe3807078a85610266b25bfbfb4b6f1a1
4
- data.tar.gz: c66b5e9ebbc690b8477e9ddd86cf81252b3c4772afebfc47e1bc1d7c8c4e6c2e
3
+ metadata.gz: 818cadfa2eedfd47ab0ff7d18b19e6bff5da3578a29c85bffa1e2c9e79f25a7f
4
+ data.tar.gz: db1416b33897d06fbf2700516fb1668fded85a2b8c849f350708438e1145fd16
5
5
  SHA512:
6
- metadata.gz: d5508109f2014f2343700ab66c13482d3d5417bc51cbbd3bd1e3a3cd3a6ef108e13722430e9de6078e939ed093cc34048dbf19f5894d6d225fd54c7836adc6cf
7
- data.tar.gz: 1b9c8910e4d95144f8c5f28057697883f2d5da8a8099ebe714289e767841c8ec2579f7d8406126db2c5372049a3ffbdf5367afcba85a96b05e5c5c7bd9b396f6
6
+ metadata.gz: d2a6705f676e96b751d28f4f42eb6fe9073d90524c0515bc2a49437ec80f2650c6e2afd7346f1bb36c3b02e37c5f7419a4450fc304b3accaf5906ef8bb37b7c5
7
+ data.tar.gz: 7c0ffc3861dbcdacf690a741926bd61ad46f545bc761b0c1056f74f7f7d9e9203858809b236a15a52d9731682a0d4caefe23f8fceb1c2f5caee69dc91112c8bb
@@ -2,10 +2,6 @@
2
2
  require 'pdf_search'
3
3
  require 'webrick'
4
4
 
5
-
6
- PdfSearch.start_webserver
7
-
8
-
9
5
  if ARGV[1]
10
6
  require File.expand_path(ARGV[1])
11
7
 
@@ -13,14 +9,17 @@ if ARGV[1]
13
9
  raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
14
10
  end
15
11
 
16
- custom_index = eval(ARGV[2])
17
- custom_index.create_index
18
- custom_index.start_daemon(ARGV[0])
12
+ search_index_class = eval(ARGV[2])
13
+ search_index_class.create_index
14
+ search_index = search_index_class.start_daemon(ARGV[0])
19
15
  else
20
16
  PdfSearch.create_elasticsearch_index
21
- PdfSearch::PdfIndex.start_daemon(ARGV[0])
17
+ search_index = PdfSearch::PdfIndex.start_daemon(ARGV[0])
22
18
  end
23
19
 
20
+ PdfSearch.start_webserver(search_index)
21
+
22
+
24
23
 
25
24
 
26
25
 
@@ -0,0 +1,49 @@
1
+ <h1> Keyword (can be blank) </h1>
2
+ <input id='search' />
3
+ <% if !search_index.search_input_fields.nil? %>
4
+ <% search_index.search_input_fields.each do |search_input_field_name, type| %>
5
+ <% if type == :interval %>
6
+ <h1><%= search_input_field_name %></h1>
7
+
8
+ <h2> Range start </h2>
9
+ <input id="search_<%= search_input_field_name %>_start" />
10
+ <h2> Range end </h2>
11
+ <input id="search_<%= search_input_field_name %>_end" />
12
+ <% end %>
13
+ <% end %>
14
+ <% end %>
15
+
16
+ <button id='doSearch'>Search</button>
17
+
18
+ <ul id='results'>
19
+ </ul>
20
+
21
+
22
+ <script>
23
+ document.getElementById('doSearch').addEventListener('click', function(e) {
24
+ let Http = new XMLHttpRequest();
25
+ let url= encodeURI(window.location.origin + "/search");
26
+
27
+ let data = {};
28
+
29
+ data["search"] = document.getElementById('search').value;
30
+
31
+ <% if !search_index.search_input_fields.nil? %>
32
+ <% search_index.search_input_fields.each do |search_input_field_name, type| %>
33
+ <% if type == :interval %>
34
+ data["search_<%= search_input_field_name %>_start"] = document.getElementById("search_<%= search_input_field_name %>_start").value
35
+ data["search_<%= search_input_field_name %>_end"] = document.getElementById("search_<%= search_input_field_name %>_end").value
36
+ <% end %>
37
+ <% end %>
38
+ <% end %>
39
+
40
+ Http.open("POST", url);
41
+ Http.setRequestHeader('Content-Type', 'application/json; charset=UTF-8');
42
+
43
+ Http.send(JSON.stringify(data));
44
+
45
+ Http.onreadystatechange=(we)=> {
46
+ document.getElementById('results').innerHTML = Http.responseText;
47
+ }
48
+ });
49
+ </script>
@@ -0,0 +1,41 @@
1
+ require 'elasticsearch/dsl'
2
+ module PdfSearch
3
+ class ElasticSearchQuery
4
+ include Elasticsearch::DSL
5
+ attr_reader :query_specification, :search_index
6
+
7
+ def initialize(query_specification, search_index)
8
+ @search_index = search_index
9
+ @query_specification = query_specification
10
+ end
11
+
12
+ def to_hash
13
+ {
14
+ "query": {
15
+ "bool": {
16
+ "must": [
17
+ {
18
+ "match": {
19
+ "text": query_specification['search']
20
+ }
21
+ }
22
+ ].concat(range_queries)
23
+ }
24
+ }
25
+ }
26
+ end
27
+
28
+ def range_queries
29
+ search_index.search_input_fields_by_type[:interval].map do |name|
30
+ {
31
+ "range": {
32
+ name => {
33
+ gte: query_specification["search_#{name}_start"],
34
+ lte: query_specification["search_#{name}_end"]
35
+ }
36
+ }
37
+ }
38
+ end
39
+ end
40
+ end
41
+ end
@@ -4,30 +4,30 @@ require 'digest'
4
4
 
5
5
  module PdfSearch
6
6
  class PdfDir
7
- def initialize(dir = '.')
8
- @dir = dir
9
- end
7
+ def initialize(dir = '.')
8
+ @dir = dir
9
+ end
10
10
 
11
- def pdf_file_paths
12
- Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
13
- end
11
+ def pdf_file_paths
12
+ Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
13
+ end
14
14
 
15
- def pdf_documents
16
- Enumerator.new do |e|
17
- pdf_file_paths.each do |pdf_file_path|
18
- e << PDF::Reader.new(pdf_file_path)
19
- end
20
- end
21
- end
15
+ def pdf_documents
16
+ Enumerator.new do |e|
17
+ pdf_file_paths.each do |pdf_file_path|
18
+ e << PDF::Reader.new(pdf_file_path)
19
+ end
20
+ end
21
+ end
22
22
 
23
- def pages
24
- Enumerator.new do |e|
25
- pdf_documents.each do |pdf_reader|
26
- pdf_reader.pages.each do |page|
27
- e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
23
+ def pages
24
+ Enumerator.new do |e|
25
+ pdf_documents.each do |pdf_reader|
26
+ pdf_reader.pages.each do |page|
27
+ e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
28
+ end
29
+ end
28
30
  end
29
31
  end
30
- end
31
- end
32
32
  end
33
33
  end
@@ -3,7 +3,11 @@ require 'elasticsearch'
3
3
  require 'pdf_dir'
4
4
  require 'digest'
5
5
  class PdfSearch::PdfIndex
6
- attr_accessor :daemon
6
+ attr_accessor :daemon
7
+
8
+ class << self
9
+ attr_reader :search_input_fields, :properties
10
+ end
7
11
 
8
12
  def self.create_index
9
13
  ::PdfSearch::ElasticSearchClient.indices.create(
@@ -31,9 +35,29 @@ class PdfSearch::PdfIndex
31
35
  end
32
36
  end
33
37
 
34
- def self.property(property_name, type)
38
+ def self.property(property_name, type, options)
35
39
  @properties ||= {}
36
40
  @properties[property_name] = {type: type}
41
+
42
+ @search_input_fields ||= {}
43
+ @search_input_fields_by_type ||= {}
44
+
45
+ search_input_type = options.delete(:search)
46
+ @search_input_fields[property_name] = search_input_type
47
+ @search_input_fields_by_type[search_input_type] ||= []
48
+ @search_input_fields_by_type[search_input_type].push(property_name)
49
+ end
50
+
51
+ def search_input_fields_by_type
52
+ self.class.instance_variable_get(:@search_input_fields_by_type)
53
+ end
54
+
55
+ def search_input_fields
56
+ self.class.search_input_fields
57
+ end
58
+
59
+ def properties
60
+ self.class.properties
37
61
  end
38
62
 
39
63
  def initialize(pdf_dir)
@@ -1,12 +1,13 @@
1
+ require 'elastic_search_query'
1
2
  require 'pdf_index'
2
3
  require 'daemons'
4
+ require 'search_view'
3
5
  require 'pdf_dir'
4
6
  require 'elasticsearch'
5
7
  require 'web_server'
6
8
 
7
9
  module PdfSearch
8
-
9
- ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
10
+ ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
10
11
 
11
12
  GemDir = [File.dirname(__FILE__), '..']
12
13
 
@@ -14,8 +15,8 @@ module PdfSearch
14
15
  File.expand_path(File.join(GemDir + path_array))
15
16
  end
16
17
 
17
- def self.start_webserver
18
- web_server = WebServer.new
18
+ def self.start_webserver(search_index)
19
+ web_server = WebServer.new(search_index)
19
20
  web_server.start
20
21
  return web_server.daemon.pid.pid
21
22
  end
@@ -0,0 +1,25 @@
1
+ require 'erb'
2
+ require 'pry'
3
+
4
+ module PdfSearch
5
+ class SearchView
6
+ def initialize(search_index)
7
+ @search_index = search_index
8
+ end
9
+
10
+ def search_view_erb_template
11
+ File.read(::PdfSearch.relative_to_gem_path(['html', 'index.html.erb']))
12
+ end
13
+
14
+ def render
15
+ template = ERB.new(search_view_erb_template)
16
+ template.result(binding)
17
+ end
18
+
19
+ private
20
+
21
+ def search_index
22
+ @search_index
23
+ end
24
+ end
25
+ end
@@ -1,9 +1,16 @@
1
1
  require 'webrick'
2
2
  require 'json'
3
+ require 'pry-remote'
4
+
3
5
 
4
6
  class PdfSearch::WebServer
5
7
  attr_accessor :daemon
6
8
 
9
+ def initialize(search_index, debug_mode = false)
10
+ @search_index = search_index
11
+ @debug_mode = (ENV['DEBUG_PDF_SEARCH'] == '1') || debug_mode
12
+ end
13
+
7
14
  def basic_auth
8
15
  return @basic_auth if @basic_auth != nil
9
16
 
@@ -15,31 +22,44 @@ class PdfSearch::WebServer
15
22
 
16
23
  config[:UserDB] = htpasswd
17
24
 
18
- puts config.inspect
19
-
20
25
  @basic_auth = WEBrick::HTTPAuth::BasicAuth.new config
21
- end
26
+ end
27
+
28
+ def index(request, response)
29
+ search_view = ::PdfSearch::SearchView.new(@search_index)
30
+ response.status = 200
31
+ response['Content-Type'] = 'text/html'
32
+ response.body = search_view.render
33
+ end
34
+
35
+ def auth_defined?
36
+ [ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
37
+ env_var != nil && env_var != ''
38
+ end
39
+ end
40
+
41
+ def search(request, response)
42
+ if auth_defined?
43
+ basic_auth.authenticate(request, response)
44
+ end
45
+
46
+ query = ::PdfSearch::ElasticSearchQuery.new(JSON.parse(request.body), @search_index)
47
+ elastic_response = ::PdfSearch::ElasticSearchClient.search index: 'pdf_pages', body: query.to_hash
48
+
49
+ response.body = response_html(elastic_response)
50
+ end
22
51
 
23
52
  def start
24
53
  start_server = lambda do
25
- server = WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => ::PdfSearch.relative_to_gem_path(['html']))
26
-
27
- server.mount_proc '/search' do |request, response|
28
- if [ENV['PDF_SEARCH_USERNAME'], ENV['PDF_SEARCH_PASSWORD']].all? do |env_var|
29
- env_var != nil && env_var != ''
30
- end
54
+ server = WEBrick::HTTPServer.new(:Port => 80)
31
55
 
32
- basic_auth.authenticate(request, response)
33
- end
34
- query = request.query["query"]
35
- elastic_response = ::PdfSearch::ElasticSearchClient.search q: query, size: 200
36
- response.body = response_html(elastic_response)
37
- end
56
+ server.mount_proc '/', &method(:index)
57
+ server.mount_proc '/search', &method(:search)
38
58
 
39
59
  server.start
40
60
  end
41
61
 
42
- if ENV['DEBUG_PDF_SEARCH'] == '1'
62
+ if @debug_mode
43
63
  start_server.call
44
64
  else
45
65
  self.daemon = Daemons.call(multiple: true, &start_server)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Arno Korfmann
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: elasticsearch-dsl
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: rake
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -118,10 +132,12 @@ extra_rdoc_files: []
118
132
  files:
119
133
  - bin/pdf_search
120
134
  - config/elasticsearch.yml
121
- - html/index.html
135
+ - html/index.html.erb
136
+ - lib/elastic_search_query.rb
122
137
  - lib/pdf_dir.rb
123
138
  - lib/pdf_index.rb
124
139
  - lib/pdf_search.rb
140
+ - lib/search_view.rb
125
141
  - lib/web_server.rb
126
142
  - log/pdf_index.log
127
143
  homepage: https://github.com/banalBI/pdfsearch
@@ -144,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
144
160
  version: '0'
145
161
  requirements: []
146
162
  rubyforge_project:
147
- rubygems_version: 2.7.6
163
+ rubygems_version: 3.0.0.beta3
148
164
  signing_key:
149
165
  specification_version: 4
150
166
  summary: Searching pdfs by leveragin Elasticsearch
@@ -1,20 +0,0 @@
1
- <input id='search'></input>
2
-
3
- <button id='doSearch'>Search</button>
4
-
5
- <ul id='results'>
6
- </ul>
7
-
8
-
9
- <script>
10
- document.getElementById('doSearch').addEventListener('click', function(e) {
11
- let Http = new XMLHttpRequest();
12
- let url= encodeURI(window.location.origin + "/search?query="+document.getElementById('search').value);
13
- Http.open("GET", url);
14
- Http.send();
15
-
16
- Http.onreadystatechange=(e)=> {
17
- document.getElementById('results').innerHTML = Http.responseText;
18
- }
19
- });
20
- </script>