pdf_search 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5506277019033c54c84b0aeec3deae307b304af0392f17bd7e5f1147feb07ad8
4
- data.tar.gz: d0488eefb43bdd4cbba1e9bfc0656eefd3e4ff41762b70d4f1e8dc0558fd73c3
3
+ metadata.gz: 7eaa4d55f2ab3006db01072da969f5d9ed910fbc475079ad0319c0f63cf24383
4
+ data.tar.gz: 824527a902950a7ae324155173665ce4837aa094e60a833df21a897254c265b6
5
5
  SHA512:
6
- metadata.gz: 8a20e3fffff896a967b395bae72483d99f733bf1bc1c949c3ac0f4426bd066550741c3b9e664ce71b633fb0b4e2f5c34f958123cb758deb181aaabbd5e1b8bd0
7
- data.tar.gz: a2555c899aba3123f5f99b985590d75927515044111d8a7556fae92848b6a39dc335f6aa29aa1d6a3e0dada710e2e8e07e0599277240f783faff489c3b52c699
6
+ metadata.gz: 8a4de679cfef7215600e55b93df197a49bdb70a70bbbcaf923c40eaeb72f732149a2e02b01650ea67d1fafcc12b17d2fbbf259ffdfc3ab5090eb9df3cb4a06de
7
+ data.tar.gz: 979b8c13be47593bb847514096f9d67a85958bf98f68d6eb8c5fc51779d1fe7faf8a9212b6db4fa20941ca7be5d2f45c8517342a6277fdeda435f6fef81d05fe
data/bin/pdf_search CHANGED
@@ -1,18 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
- require 'pdfsearch'
2
+ require 'pdf_search'
3
3
  require 'webrick'
4
4
 
5
- `sudo service elasticsearch start`
6
5
 
6
+ PdfSearch.create_elasticsearch_index
7
+ PdfSearch.start_webserver
8
+ PdfSearch::PdfIndex.start_daemon(ARGV[0])
7
9
 
8
- gemDir = [File.dirname(__FILE__), '..']
9
- indexPdfsScriptPath = File.expand_path(File.join(gemDir + ['lib', 'indexPdfs.rb']))
10
- htmlDir = File.expand_path(File.join(gemDir + ['html']))
11
10
 
12
11
 
13
- WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => htmlDir).start
14
12
 
15
13
 
16
- # PdfSearch::Indexer.startDeamon
17
-
18
14
 
data/config/elasticsearch.yml ADDED
@@ -0,0 +1,2 @@
1
+ cluster.routing.allocation.disk.threshold_enabled: false
2
+
data/html/index.html ADDED
@@ -0,0 +1,20 @@
1
<!-- Search page: sends the typed query to the /search endpoint and shows the
     markup it returns inside the results list. -->
<input id='search'>

<button id='doSearch'>Search</button>

<ul id='results'>
</ul>

<script>
  document.getElementById('doSearch').addEventListener('click', function () {
    const request = new XMLHttpRequest();
    const query = document.getElementById('search').value;

    // encodeURIComponent (unlike encodeURI) also escapes '&', '#', '+' and '=',
    // so those characters stay part of the query value instead of altering the URL.
    // The path is relative so the request goes to whichever host served this page.
    request.open("GET", "/search?query=" + encodeURIComponent(query));

    // Register the handler before sending and render once, when the response
    // has fully arrived, rather than on every readyState transition.
    request.onload = function () {
      // The /search endpoint answers with ready-made <li> markup.
      document.getElementById('results').innerHTML = request.responseText;
    };

    request.send();
  });
</script>
data/lib/pdf_dir.rb CHANGED
@@ -12,7 +12,7 @@ module PdfSearch
12
12
  Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
13
13
  end
14
14
 
15
- def each_pdf
15
+ def pdf_documents
16
16
  Enumerator.new do |e|
17
17
  pdf_file_paths.each do |pdf_file_path|
18
18
  e << PDF::Reader.new(pdf_file_path)
@@ -20,9 +20,9 @@ module PdfSearch
20
20
  end
21
21
  end
22
22
 
23
- def each_page
23
+ def pages
24
24
  Enumerator.new do |e|
25
- each_pdf do |pdf_reader|
25
+ pdf_documents.each do |pdf_reader|
26
26
  pdf_reader.pages.each do |page|
27
27
  e << page
28
28
  end
data/lib/pdf_index.rb CHANGED
@@ -1,5 +1,44 @@
1
- require 'pdfiterator'
2
-
3
- PdfIterator.new.traverse_current_directory_pdf_texts do |pdf_text|
4
- `echo "#{pdf_text}" >> log/indexPdfs.log`
1
+ require 'daemons'
2
+ require 'elasticsearch'
3
+ require 'pdf_dir'
4
+ require 'digest'
5
# Indexes the text of every page yielded by a PDF directory into the
# Elasticsearch index 'pdf_pages', either in the foreground or in a
# background process started through Daemons.
class PdfSearch::PdfIndex
  # Pause (in seconds) taken before each page is sent to Elasticsearch.
  INDEX_THROTTLE_SECONDS = 0.5

  # Handle returned by Daemons.call; stays nil when no daemon was started.
  attr_accessor :daemon

  # @param pdf_dir [#pages] object whose #pages enumerates pages responding to #text
  def initialize(pdf_dir)
    @pdf_dir = pdf_dir
    @els_client = ::PdfSearch::ElasticSearchClient
  end

  # Builds an index for +dir+ and starts indexing it.
  #
  # When DEBUG_PDF_INDEXING is set the loop runs in the calling process (and
  # therefore never returns); otherwise it is handed to Daemons.call.
  #
  # @param dir [String] directory passed on to PdfSearch::PdfDir.new
  # @return [PdfSearch::PdfIndex] the index whose #daemon holds the Daemons handle
  def self.start_daemon(dir)
    pdf_index = new(::PdfSearch::PdfDir.new(dir))
    if ENV['DEBUG_PDF_INDEXING']
      pdf_index.index_loop
    else
      pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop))
    end
    pdf_index
  end

  # Re-indexes the directory over and over, forever.
  def index_loop
    loop { reindex }
  end

  # @return the value of daemon.pid.pid (presumably the numeric process id —
  #   confirm against the Daemons API)
  # @raise [NoMethodError] when no daemon was started (daemon is nil)
  def pid
    daemon.pid.pid
  end

  # Sends every page of the directory to Elasticsearch, one request per page.
  def reindex
    @pdf_dir.pages.each do |page|
      sleep INDEX_THROTTLE_SECONDS
      index_page(page)
    end
  end

  private

  # Indexes a single page. A page that fails is reported on stderr and skipped
  # so that one bad page does not stop the whole run.
  def index_page(page)
    text = page.text
    @els_client.index(
      index: 'pdf_pages',
      type: 'document',
      # hexdigest yields a printable id; the raw digest is arbitrary binary
      # bytes, which is not a usable document id.
      id: Digest::SHA256.hexdigest(text),
      body: { text: text }
    )
  rescue StandardError => e
    warn "pdf_search: failed to index page: #{e.class}: #{e.message}"
  end
end
data/lib/pdf_search.rb CHANGED
@@ -1,5 +1,59 @@
1
1
  require 'pdf_index'
2
+ require 'daemons'
2
3
  require 'pdf_dir'
4
+ require 'elasticsearch'
5
+ require 'web_server'
3
6
 
4
7
  module PdfSearch
8
+
9
+ ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
10
+
11
+ GemDir = [File.dirname(__FILE__), '..']
12
+
13
+ def self.relative_to_gem_path(path_array)
14
+ File.expand_path(File.join(GemDir + path_array))
15
+ end
16
+
17
+ def self.start_webserver
18
+ web_server = WebServer.new
19
+ web_server.start
20
+ return web_server.daemon.pid.pid
21
+ end
22
+
23
+ def self.start_elasticsearch
24
+ `sudo service elasticsearch start`
25
+ end
26
+
27
+ def self.create_elasticsearch_index
28
+ `curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
29
+
30
+ ElasticSearchClient.indices.create(
31
+ index: 'pdf_pages',
32
+ body: {
33
+ mappings: {
34
+ document: {
35
+ properties: {
36
+ text: {
37
+ type: 'text'
38
+ }
39
+ }
40
+ }
41
+ }
42
+ }
43
+ )
44
+
45
+ return true
46
+
47
+ rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
48
+ if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
49
+ return false
50
+ else
51
+ raise e
52
+ end
53
+ end
54
+
55
+ def self.wrap_elastic_request
56
+ `curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
57
+ yield
58
+ end
5
59
  end
data/lib/web_server.rb ADDED
@@ -0,0 +1,33 @@
1
require 'json'
require 'net/http'
require 'webrick'
2
+
3
# Serves the static files of the gem's html directory on port 80 together
# with a /search endpoint that forwards the query to Elasticsearch.
class PdfSearch::WebServer
  # Handle returned by Daemons.call; stays nil when running in the foreground.
  attr_accessor :daemon

  # Starts the server: in the calling process when DEBUG_PDF_SEARCH is '1'
  # (blocking), otherwise in a process created through Daemons.call.
  def start
    if ENV['DEBUG_PDF_SEARCH'] == '1'
      run_server
    else
      self.daemon = Daemons.call(multiple: true) { run_server }
    end
  end

  # Renders the hits of an Elasticsearch search response as list items.
  #
  # @param elasticsearch_response [String] JSON body of a search response
  # @return [String] one "<li>…</li>" line per hit, joined by newlines; empty
  #   when the response carries no hits (e.g. an error response)
  def response_html(elasticsearch_response)
    hits = JSON.parse(elasticsearch_response).fetch('hits', {}).fetch('hits', [])

    hits.map do |hit|
      # Page text is arbitrary PDF content: escape it so it is shown as text
      # instead of being interpreted as markup.
      "<li>#{WEBrick::HTMLUtils.escape(hit['_source']['text'])}</li>"
    end.join("\n")
  end

  private

  # Builds the HTTP server, mounts /search and serves until shut down.
  def run_server
    server = WEBrick::HTTPServer.new(Port: 80, DocumentRoot: ::PdfSearch.relative_to_gem_path(['html']))

    server.mount_proc '/search' do |request, response|
      query = request.query['query'].to_s
      # A missing or blank query has nothing to search for.
      response.body = query.strip.empty? ? '' : response_html(search(query))
    end

    server.start
  end

  # Runs the query against the text field of all indices.
  #
  # @param query [String] search terms as typed by the user
  # @return [String] raw JSON body returned by Elasticsearch
  def search(query)
    uri = URI.parse("#{ENV['ELASTICSEARCH_URL']}/_all/_search")
    # Form-encode the value so spaces, '&', '#', … cannot break or extend the URL.
    uri.query = URI.encode_www_form(q: "text:#{query}")
    Net::HTTP.get(uri)
  end
end
data/log/pdf_index.log ADDED
File without changes
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Arno Korfmann
@@ -18,9 +18,13 @@ extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
20
  - bin/pdf_search
21
+ - config/elasticsearch.yml
22
+ - html/index.html
21
23
  - lib/pdf_dir.rb
22
24
  - lib/pdf_index.rb
23
25
  - lib/pdf_search.rb
26
+ - lib/web_server.rb
27
+ - log/pdf_index.log
24
28
  homepage: https://github.com/banalBI/pdfsearch
25
29
  licenses:
26
30
  - MIT