pdf_search 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/pdf_search +4 -8
- data/config/elasticsearch.yml +2 -0
- data/html/index.html +20 -0
- data/lib/pdf_dir.rb +3 -3
- data/lib/pdf_index.rb +43 -4
- data/lib/pdf_search.rb +54 -0
- data/lib/web_server.rb +33 -0
- data/log/pdf_index.log +0 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7eaa4d55f2ab3006db01072da969f5d9ed910fbc475079ad0319c0f63cf24383
|
4
|
+
data.tar.gz: 824527a902950a7ae324155173665ce4837aa094e60a833df21a897254c265b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8a4de679cfef7215600e55b93df197a49bdb70a70bbbcaf923c40eaeb72f732149a2e02b01650ea67d1fafcc12b17d2fbbf259ffdfc3ab5090eb9df3cb4a06de
|
7
|
+
data.tar.gz: 979b8c13be47593bb847514096f9d67a85958bf98f68d6eb8c5fc51779d1fe7faf8a9212b6db4fa20941ca7be5d2f45c8517342a6277fdeda435f6fef81d05fe
|
data/bin/pdf_search
CHANGED
@@ -1,18 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require '
|
2
|
+
require 'pdf_search'
|
3
3
|
require 'webrick'
|
4
4
|
|
5
|
-
`sudo service elasticsearch start`
|
6
5
|
|
6
|
+
# The PDF directory is the only command-line argument. Fail fast with a usage
# message instead of starting the web server and then failing later inside
# the indexer when the argument is missing.
pdf_directory = ARGV[0]
abort 'Usage: pdf_search PDF_DIRECTORY' if pdf_directory.nil? || pdf_directory.empty?

# Ensure the search index exists, start the search web server, then start
# indexing the given directory.
PdfSearch.create_elasticsearch_index
PdfSearch.start_webserver
PdfSearch::PdfIndex.start_daemon(pdf_directory)
|
7
9
|
|
8
|
-
gemDir = [File.dirname(__FILE__), '..']
|
9
|
-
indexPdfsScriptPath = File.expand_path(File.join(gemDir + ['lib', 'indexPdfs.rb']))
|
10
|
-
htmlDir = File.expand_path(File.join(gemDir + ['html']))
|
11
10
|
|
12
11
|
|
13
|
-
WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => htmlDir).start
|
14
12
|
|
15
13
|
|
16
|
-
# PdfSearch::Indexer.startDeamon
|
17
|
-
|
18
14
|
|
data/html/index.html
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
<input id='search'>

<button id='doSearch'>Search</button>

<ul id='results'>
</ul>


<script>
  // Send the search box content to the /search endpoint and render the
  // returned <li> markup into the results list.
  document.getElementById('doSearch').addEventListener('click', function() {
    const request = new XMLHttpRequest();

    // Relative URL: works for whatever host/port served this page.
    // encodeURIComponent (unlike encodeURI) also escapes &, #, + and =,
    // so the user's text cannot break out of the `query` parameter.
    const url = '/search?query=' + encodeURIComponent(document.getElementById('search').value);

    // Register the handler before sending, and only render once the
    // response has been received completely.
    request.onreadystatechange = function() {
      if (request.readyState !== XMLHttpRequest.DONE) {
        return;
      }
      document.getElementById('results').innerHTML = request.responseText;
    };

    request.open('GET', url);
    request.send();
  });
</script>
|
data/lib/pdf_dir.rb
CHANGED
@@ -12,7 +12,7 @@ module PdfSearch
|
|
12
12
|
Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
15
|
+
def pdf_documents
|
16
16
|
Enumerator.new do |e|
|
17
17
|
pdf_file_paths.each do |pdf_file_path|
|
18
18
|
e << PDF::Reader.new(pdf_file_path)
|
@@ -20,9 +20,9 @@ module PdfSearch
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def pages
|
24
24
|
Enumerator.new do |e|
|
25
|
-
|
25
|
+
pdf_documents.each do |pdf_reader|
|
26
26
|
pdf_reader.pages.each do |page|
|
27
27
|
e << page
|
28
28
|
end
|
data/lib/pdf_index.rb
CHANGED
@@ -1,5 +1,44 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require 'daemons'
|
2
|
+
require 'elasticsearch'
|
3
|
+
require 'pdf_dir'
|
4
|
+
require 'digest'
|
5
|
+
# Pushes the text of every page found in a PDF directory into the
# 'pdf_pages' Elasticsearch index, either in the foreground or via Daemons.
class PdfSearch::PdfIndex
  # Handle returned by Daemons.call. Only assigned by start_daemon when
  # DEBUG_PDF_INDEXING is not set; nil otherwise.
  attr_accessor :daemon

  # @param pdf_dir [#pages] object whose #pages yields items responding to #text
  def initialize(pdf_dir)
    @pdf_dir = pdf_dir
    @els_client = ::PdfSearch::ElasticSearchClient
  end

  # Builds an index for +dir+ and starts the endless indexing loop.
  #
  # With DEBUG_PDF_INDEXING set the loop runs in the calling process (and
  # therefore never returns); otherwise the loop is handed to Daemons.call.
  #
  # @param dir [String] directory containing the PDF files
  # @return [PdfSearch::PdfIndex] the index object that owns the daemon handle
  def self.start_daemon(dir)
    pdf_index = new(::PdfSearch::PdfDir.new(dir))
    if ENV['DEBUG_PDF_INDEXING']
      pdf_index.index_loop
    else
      pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop))
    end
    pdf_index
  end

  # Re-indexes the directory over and over; never returns.
  def index_loop
    loop { reindex }
  end

  # @return [Integer, nil] pid reported by the daemon handle, or nil when no
  #   daemon was started (presumably an OS process id - confirm against Daemons)
  def pid
    daemon && daemon.pid.pid
  end

  # Indexes every page of the directory once. A page that fails to extract or
  # to index is reported on stderr and skipped so the remaining pages are
  # still processed.
  def reindex
    @pdf_dir.pages.each do |page|
      begin
        sleep 0.5 # pause between pages, kept from the original implementation
        text = page.text
        @els_client.index(
          index: 'pdf_pages',
          type: 'document',
          # hexdigest yields a printable ASCII id; the raw bytes returned by
          # `digest` are not a usable document id.
          id: Digest::SHA256.hexdigest(text),
          body: { text: text }
        )
      rescue StandardError => e
        warn "pdf_search: failed to index page (#{e.class}: #{e.message})"
      end
    end
  end
end
|
data/lib/pdf_search.rb
CHANGED
@@ -1,5 +1,59 @@
|
|
1
1
|
require 'pdf_index'
|
2
|
+
require 'daemons'
|
2
3
|
require 'pdf_dir'
|
4
|
+
require 'elasticsearch'
|
5
|
+
require 'web_server'
|
3
6
|
|
4
7
|
# Top-level namespace: shared Elasticsearch client, gem-relative path helper
# and the entry points used by the `pdf_search` executable.
module PdfSearch

  # Shared client; the host comes from ELASTICSEARCH_URL.
  ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']

  # Path segments leading from this file's directory to the gem root.
  GemDir = [File.dirname(__FILE__), '..'].freeze

  # @param path_array [Array<String>] path segments relative to the gem root
  # @return [String] absolute path
  def self.relative_to_gem_path(path_array)
    File.expand_path(File.join(GemDir + path_array))
  end

  # Starts the search web server.
  #
  # @return [Integer, nil] pid reported by the server's daemon handle, or nil
  #   when the server ran in the foreground and no daemon handle exists
  def self.start_webserver
    web_server = WebServer.new
    web_server.start
    daemon = web_server.daemon
    daemon && daemon.pid.pid
  end

  # Starts the system Elasticsearch service (requires sudo).
  def self.start_elasticsearch
    `sudo service elasticsearch start`
  end

  # Creates the 'pdf_pages' index with a single text field.
  #
  # @return [Boolean] true when the index was created, false when it already existed
  # @raise [Elasticsearch::Transport::Transport::Errors::BadRequest] for any
  #   other bad-request response
  def self.create_elasticsearch_index
    clear_read_only_block

    ElasticSearchClient.indices.create(
      index: 'pdf_pages',
      body: {
        mappings: {
          document: {
            properties: {
              text: {
                type: 'text'
              }
            }
          }
        }
      }
    )

    true
  rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
    raise unless /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message

    false
  end

  # Resets the read-only block on all indices, then yields to the caller.
  #
  # @return whatever the block returns
  def self.wrap_elastic_request
    clear_read_only_block
    yield
  end

  # Sets index.blocks.read_only_allow_delete to null on all indices.
  # Goes through the shared client so it targets the same server as every
  # other request (ELASTICSEARCH_URL) rather than a fixed localhost address.
  # Best effort: a failure is reported on stderr and does not abort the caller.
  def self.clear_read_only_block
    ElasticSearchClient.indices.put_settings(
      index: '_all',
      body: { 'index.blocks.read_only_allow_delete' => nil }
    )
  rescue StandardError => e
    warn "pdf_search: could not reset read-only block (#{e.class}: #{e.message})"
  end
  private_class_method :clear_read_only_block
end
|
data/lib/web_server.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'cgi'
require 'json'
require 'net/http'
require 'webrick'
|
2
|
+
|
3
|
+
# Serves the static search page from the gem's html directory and a /search
# endpoint that forwards the query to Elasticsearch.
class PdfSearch::WebServer
  # Handle returned by Daemons.call; nil when the server runs in the
  # foreground (DEBUG_PDF_SEARCH=1).
  attr_accessor :daemon

  # Starts the server on port 80. With DEBUG_PDF_SEARCH=1 it runs in the
  # calling process and blocks; otherwise it is handed to Daemons.call and
  # the resulting handle is stored in #daemon.
  def start
    start_server = lambda do
      server = WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => ::PdfSearch.relative_to_gem_path(['html']))

      server.mount_proc '/search' do |request, response|
        elastic_response = Net::HTTP.get(search_uri(request.query['query']))
        response.body = response_html(elastic_response)
      end

      server.start
    end

    if ENV['DEBUG_PDF_SEARCH'] == '1'
      start_server.call
    else
      self.daemon = Daemons.call(multiple: true, &start_server)
    end
  end

  # Renders the hits of an Elasticsearch search response as <li> elements.
  #
  # @param elasticsearch_response [String] JSON body of the search response
  # @return [String] one <li> per hit, newline separated; empty when the
  #   response carries no hits (for example an error response)
  def response_html(elasticsearch_response)
    hits = JSON.parse(elasticsearch_response).fetch('hits', {}).fetch('hits', [])

    hits.map do |hit|
      # The text comes from arbitrary PDFs; escape it so it is shown as text
      # rather than interpreted as markup by the browser.
      "<li>#{CGI.escapeHTML(hit['_source']['text'].to_s)}</li>"
    end.join("\n")
  end

  private

  # Builds the search URI for +query+. The user-supplied value is
  # form-encoded so spaces and characters such as & or # can neither break
  # URI parsing nor add extra request parameters.
  #
  # @param query [String, nil] raw value of the `query` request parameter
  # @return [URI]
  def search_uri(query)
    # Fall back to a local node when ELASTICSEARCH_URL is unset
    # (assumed to match the Elasticsearch client's own default - confirm).
    base_url = ENV.fetch('ELASTICSEARCH_URL', 'http://localhost:9200')
    URI.parse("#{base_url}/_all/_search?q=#{URI.encode_www_form_component("text:#{query}")}")
  end
end
|
data/log/pdf_index.log
ADDED
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Arno Korfmann
|
@@ -18,9 +18,13 @@ extensions: []
|
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
20
|
- bin/pdf_search
|
21
|
+
- config/elasticsearch.yml
|
22
|
+
- html/index.html
|
21
23
|
- lib/pdf_dir.rb
|
22
24
|
- lib/pdf_index.rb
|
23
25
|
- lib/pdf_search.rb
|
26
|
+
- lib/web_server.rb
|
27
|
+
- log/pdf_index.log
|
24
28
|
homepage: https://github.com/banalBI/pdfsearch
|
25
29
|
licenses:
|
26
30
|
- MIT
|