pdf_search 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5506277019033c54c84b0aeec3deae307b304af0392f17bd7e5f1147feb07ad8
4
- data.tar.gz: d0488eefb43bdd4cbba1e9bfc0656eefd3e4ff41762b70d4f1e8dc0558fd73c3
3
+ metadata.gz: 7eaa4d55f2ab3006db01072da969f5d9ed910fbc475079ad0319c0f63cf24383
4
+ data.tar.gz: 824527a902950a7ae324155173665ce4837aa094e60a833df21a897254c265b6
5
5
  SHA512:
6
- metadata.gz: 8a20e3fffff896a967b395bae72483d99f733bf1bc1c949c3ac0f4426bd066550741c3b9e664ce71b633fb0b4e2f5c34f958123cb758deb181aaabbd5e1b8bd0
7
- data.tar.gz: a2555c899aba3123f5f99b985590d75927515044111d8a7556fae92848b6a39dc335f6aa29aa1d6a3e0dada710e2e8e07e0599277240f783faff489c3b52c699
6
+ metadata.gz: 8a4de679cfef7215600e55b93df197a49bdb70a70bbbcaf923c40eaeb72f732149a2e02b01650ea67d1fafcc12b17d2fbbf259ffdfc3ab5090eb9df3cb4a06de
7
+ data.tar.gz: 979b8c13be47593bb847514096f9d67a85958bf98f68d6eb8c5fc51779d1fe7faf8a9212b6db4fa20941ca7be5d2f45c8517342a6277fdeda435f6fef81d05fe
data/bin/pdf_search CHANGED
@@ -1,18 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
- require 'pdfsearch'
2
+ require 'pdf_search'
3
3
  require 'webrick'
4
4
 
5
- `sudo service elasticsearch start`
6
5
 
6
+ PdfSearch.create_elasticsearch_index
7
+ PdfSearch.start_webserver
8
+ PdfSearch::PdfIndex.start_daemon(ARGV[0])
7
9
 
8
- gemDir = [File.dirname(__FILE__), '..']
9
- indexPdfsScriptPath = File.expand_path(File.join(gemDir + ['lib', 'indexPdfs.rb']))
10
- htmlDir = File.expand_path(File.join(gemDir + ['html']))
11
10
 
12
11
 
13
- WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => htmlDir).start
14
12
 
15
13
 
16
- # PdfSearch::Indexer.startDeamon
17
-
18
14
 
data/config/elasticsearch.yml ADDED
@@ -0,0 +1,2 @@
1
+ cluster.routing.allocation.disk.threshold_enabled: false
2
+
data/html/index.html ADDED
@@ -0,0 +1,20 @@
1
+ <input id='search'></input>
2
+
3
+ <button id='doSearch'>Search</button>
4
+
5
+ <ul id='results'>
6
+ </ul>
7
+
8
+
9
+ <script>
10
+ document.getElementById('doSearch').addEventListener('click', function(e) {
11
+ let Http = new XMLHttpRequest();
12
+ let url= encodeURI("http://localhost:80/search?query="+document.getElementById('search').value);
13
+ Http.open("GET", url);
14
+ Http.send();
15
+
16
+ Http.onreadystatechange=(e)=> {
17
+ document.getElementById('results').innerHTML = Http.responseText;
18
+ }
19
+ });
20
+ </script>
data/lib/pdf_dir.rb CHANGED
@@ -12,7 +12,7 @@ module PdfSearch
12
12
  Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
13
13
  end
14
14
 
15
- def each_pdf
15
+ def pdf_documents
16
16
  Enumerator.new do |e|
17
17
  pdf_file_paths.each do |pdf_file_path|
18
18
  e << PDF::Reader.new(pdf_file_path)
@@ -20,9 +20,9 @@ module PdfSearch
20
20
  end
21
21
  end
22
22
 
23
- def each_page
23
+ def pages
24
24
  Enumerator.new do |e|
25
- each_pdf do |pdf_reader|
25
+ pdf_documents.each do |pdf_reader|
26
26
  pdf_reader.pages.each do |page|
27
27
  e << page
28
28
  end
data/lib/pdf_index.rb CHANGED
@@ -1,5 +1,44 @@
1
- require 'pdfiterator'
2
-
3
- PdfIterator.new.traverse_current_directory_pdf_texts do |pdf_text|
4
- `echo "#{pdf_text}" >> log/indexPdfs.log`
1
+ require 'daemons'
2
+ require 'elasticsearch'
3
+ require 'pdf_dir'
4
+ require 'digest'
5
+ class PdfSearch::PdfIndex
6
+ attr_accessor :daemon
7
+ def initialize(pdf_dir)
8
+ @pdf_dir = pdf_dir
9
+ @els_client = ::PdfSearch::ElasticSearchClient
10
+ end
11
+ def self.start_daemon(dir)
12
+ pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
13
+ if ENV['DEBUG_PDF_INDEXING']
14
+ pdf_index.index_loop
15
+ else
16
+ pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop))
17
+ end
18
+ pdf_index
19
+ end
20
+ def index_loop
21
+ loop do
22
+ self.reindex
23
+ end
24
+ end
25
+ def pid
26
+ daemon.pid.pid
27
+ end
28
+ def reindex
29
+ @pdf_dir.pages.each.with_index do |page, index|
30
+ begin
31
+ sleep 0.5
32
+ @els_client.index(
33
+ index: 'pdf_pages',
34
+ type: 'document',
35
+ id: Digest::SHA256.digest(page.text),
36
+ body: {
37
+ text: page.text
38
+ }
39
+ )
40
+ rescue
41
+ end
42
+ end
43
+ end
5
44
  end
data/lib/pdf_search.rb CHANGED
@@ -1,5 +1,59 @@
1
1
  require 'pdf_index'
2
+ require 'daemons'
2
3
  require 'pdf_dir'
4
+ require 'elasticsearch'
5
+ require 'web_server'
3
6
 
4
7
  module PdfSearch
8
+
9
+ ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
10
+
11
+ GemDir = [File.dirname(__FILE__), '..']
12
+
13
+ def self.relative_to_gem_path(path_array)
14
+ File.expand_path(File.join(GemDir + path_array))
15
+ end
16
+
17
+ def self.start_webserver
18
+ web_server = WebServer.new
19
+ web_server.start
20
+ return web_server.daemon.pid.pid
21
+ end
22
+
23
+ def self.start_elasticsearch
24
+ `sudo service elasticsearch start`
25
+ end
26
+
27
+ def self.create_elasticsearch_index
28
+ `curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
29
+
30
+ ElasticSearchClient.indices.create(
31
+ index: 'pdf_pages',
32
+ body: {
33
+ mappings: {
34
+ document: {
35
+ properties: {
36
+ text: {
37
+ type: 'text'
38
+ }
39
+ }
40
+ }
41
+ }
42
+ }
43
+ )
44
+
45
+ return true
46
+
47
+ rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
48
+ if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
49
+ return false
50
+ else
51
+ raise e
52
+ end
53
+ end
54
+
55
+ def self.wrap_elastic_request
56
+ `curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
57
+ yield
58
+ end
5
59
  end
data/lib/web_server.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'webrick'
2
+
3
+ class PdfSearch::WebServer
4
+ attr_accessor :daemon
5
+
6
+ def start
7
+ start_server = lambda do
8
+ server = WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => ::PdfSearch.relative_to_gem_path(['html']))
9
+
10
+ server.mount_proc '/search' do |request, response|
11
+ query = request.query["query"]
12
+ elastic_response = Net::HTTP.get(URI.parse("#{ENV['ELASTICSEARCH_URL']}/_all/_search?q=text:#{query}"))
13
+ response.body = response_html(elastic_response)
14
+ end
15
+
16
+ server.start
17
+ end
18
+
19
+ if ENV['DEBUG_PDF_SEARCH'] == '1'
20
+ start_server.call
21
+ else
22
+ self.daemon = Daemons.call(multiple: true, &start_server)
23
+ end
24
+ end
25
+
26
+ def response_html(elasticsearch_response)
27
+ results = JSON.parse(elasticsearch_response)['hits']['hits']
28
+
29
+ results.map do |result|
30
+ "<li>#{result["_source"]["text"]}</li>"
31
+ end.join("\n")
32
+ end
33
+ end
data/log/pdf_index.log ADDED
File without changes
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Arno Korfmann
@@ -18,9 +18,13 @@ extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
20
  - bin/pdf_search
21
+ - config/elasticsearch.yml
22
+ - html/index.html
21
23
  - lib/pdf_dir.rb
22
24
  - lib/pdf_index.rb
23
25
  - lib/pdf_search.rb
26
+ - lib/web_server.rb
27
+ - log/pdf_index.log
24
28
  homepage: https://github.com/banalBI/pdfsearch
25
29
  licenses:
26
30
  - MIT