pdf_search 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/pdf_search +4 -8
- data/config/elasticsearch.yml +2 -0
- data/html/index.html +20 -0
- data/lib/pdf_dir.rb +3 -3
- data/lib/pdf_index.rb +43 -4
- data/lib/pdf_search.rb +54 -0
- data/lib/web_server.rb +33 -0
- data/log/pdf_index.log +0 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7eaa4d55f2ab3006db01072da969f5d9ed910fbc475079ad0319c0f63cf24383
|
4
|
+
data.tar.gz: 824527a902950a7ae324155173665ce4837aa094e60a833df21a897254c265b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8a4de679cfef7215600e55b93df197a49bdb70a70bbbcaf923c40eaeb72f732149a2e02b01650ea67d1fafcc12b17d2fbbf259ffdfc3ab5090eb9df3cb4a06de
|
7
|
+
data.tar.gz: 979b8c13be47593bb847514096f9d67a85958bf98f68d6eb8c5fc51779d1fe7faf8a9212b6db4fa20941ca7be5d2f45c8517342a6277fdeda435f6fef81d05fe
|
data/bin/pdf_search
CHANGED
@@ -1,18 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require '
|
2
|
+
require 'pdf_search'
|
3
3
|
require 'webrick'
|
4
4
|
|
5
|
-
`sudo service elasticsearch start`
|
6
5
|
|
6
|
+
PdfSearch.create_elasticsearch_index
|
7
|
+
PdfSearch.start_webserver
|
8
|
+
PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
7
9
|
|
8
|
-
gemDir = [File.dirname(__FILE__), '..']
|
9
|
-
indexPdfsScriptPath = File.expand_path(File.join(gemDir + ['lib', 'indexPdfs.rb']))
|
10
|
-
htmlDir = File.expand_path(File.join(gemDir + ['html']))
|
11
10
|
|
12
11
|
|
13
|
-
WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => htmlDir).start
|
14
12
|
|
15
13
|
|
16
|
-
# PdfSearch::Indexer.startDeamon
|
17
|
-
|
18
14
|
|
data/html/index.html
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
<input id='search'></input>
|
2
|
+
|
3
|
+
<button id='doSearch'>Search</button>
|
4
|
+
|
5
|
+
<ul id='results'>
|
6
|
+
</ul>
|
7
|
+
|
8
|
+
|
9
|
+
<script>
|
10
|
+
document.getElementById('doSearch').addEventListener('click', function(e) {
|
11
|
+
let Http = new XMLHttpRequest();
|
12
|
+
let url= encodeURI("http://localhost:80/search?query="+document.getElementById('search').value);
|
13
|
+
Http.open("GET", url);
|
14
|
+
Http.send();
|
15
|
+
|
16
|
+
Http.onreadystatechange=(e)=> {
|
17
|
+
document.getElementById('results').innerHTML = Http.responseText;
|
18
|
+
}
|
19
|
+
});
|
20
|
+
</script>
|
data/lib/pdf_dir.rb
CHANGED
@@ -12,7 +12,7 @@ module PdfSearch
|
|
12
12
|
Dir.glob(File.join(File.expand_path(@dir), '*.pdf'))
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
15
|
+
def pdf_documents
|
16
16
|
Enumerator.new do |e|
|
17
17
|
pdf_file_paths.each do |pdf_file_path|
|
18
18
|
e << PDF::Reader.new(pdf_file_path)
|
@@ -20,9 +20,9 @@ module PdfSearch
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def pages
|
24
24
|
Enumerator.new do |e|
|
25
|
-
|
25
|
+
pdf_documents.each do |pdf_reader|
|
26
26
|
pdf_reader.pages.each do |page|
|
27
27
|
e << page
|
28
28
|
end
|
data/lib/pdf_index.rb
CHANGED
@@ -1,5 +1,44 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require 'daemons'
|
2
|
+
require 'elasticsearch'
|
3
|
+
require 'pdf_dir'
|
4
|
+
require 'digest'
|
5
|
+
class PdfSearch::PdfIndex
|
6
|
+
attr_accessor :daemon
|
7
|
+
def initialize(pdf_dir)
|
8
|
+
@pdf_dir = pdf_dir
|
9
|
+
@els_client = ::PdfSearch::ElasticSearchClient
|
10
|
+
end
|
11
|
+
def self.start_daemon(dir)
|
12
|
+
pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
|
13
|
+
if ENV['DEBUG_PDF_INDEXING']
|
14
|
+
pdf_index.index_loop
|
15
|
+
else
|
16
|
+
pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop))
|
17
|
+
end
|
18
|
+
pdf_index
|
19
|
+
end
|
20
|
+
def index_loop
|
21
|
+
loop do
|
22
|
+
self.reindex
|
23
|
+
end
|
24
|
+
end
|
25
|
+
def pid
|
26
|
+
daemon.pid.pid
|
27
|
+
end
|
28
|
+
def reindex
|
29
|
+
@pdf_dir.pages.each.with_index do |page, index|
|
30
|
+
begin
|
31
|
+
sleep 0.5
|
32
|
+
@els_client.index(
|
33
|
+
index: 'pdf_pages',
|
34
|
+
type: 'document',
|
35
|
+
id: Digest::SHA256.digest(page.text),
|
36
|
+
body: {
|
37
|
+
text: page.text
|
38
|
+
}
|
39
|
+
)
|
40
|
+
rescue
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
5
44
|
end
|
data/lib/pdf_search.rb
CHANGED
@@ -1,5 +1,59 @@
|
|
1
1
|
require 'pdf_index'
|
2
|
+
require 'daemons'
|
2
3
|
require 'pdf_dir'
|
4
|
+
require 'elasticsearch'
|
5
|
+
require 'web_server'
|
3
6
|
|
4
7
|
module PdfSearch
|
8
|
+
|
9
|
+
ElasticSearchClient = Elasticsearch::Client.new log: true, host: ENV['ELASTICSEARCH_URL']
|
10
|
+
|
11
|
+
GemDir = [File.dirname(__FILE__), '..']
|
12
|
+
|
13
|
+
def self.relative_to_gem_path(path_array)
|
14
|
+
File.expand_path(File.join(GemDir + path_array))
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.start_webserver
|
18
|
+
web_server = WebServer.new
|
19
|
+
web_server.start
|
20
|
+
return web_server.daemon.pid.pid
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.start_elasticsearch
|
24
|
+
`sudo service elasticsearch start`
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.create_elasticsearch_index
|
28
|
+
`curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
|
29
|
+
|
30
|
+
ElasticSearchClient.indices.create(
|
31
|
+
index: 'pdf_pages',
|
32
|
+
body: {
|
33
|
+
mappings: {
|
34
|
+
document: {
|
35
|
+
properties: {
|
36
|
+
text: {
|
37
|
+
type: 'text'
|
38
|
+
}
|
39
|
+
}
|
40
|
+
}
|
41
|
+
}
|
42
|
+
}
|
43
|
+
)
|
44
|
+
|
45
|
+
return true
|
46
|
+
|
47
|
+
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
|
48
|
+
if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
|
49
|
+
return false
|
50
|
+
else
|
51
|
+
raise e
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.wrap_elastic_request
|
56
|
+
`curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
|
57
|
+
yield
|
58
|
+
end
|
5
59
|
end
|
data/lib/web_server.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'webrick'
|
2
|
+
|
3
|
+
class PdfSearch::WebServer
|
4
|
+
attr_accessor :daemon
|
5
|
+
|
6
|
+
def start
|
7
|
+
start_server = lambda do
|
8
|
+
server = WEBrick::HTTPServer.new(:Port => 80, :DocumentRoot => ::PdfSearch.relative_to_gem_path(['html']))
|
9
|
+
|
10
|
+
server.mount_proc '/search' do |request, response|
|
11
|
+
query = request.query["query"]
|
12
|
+
elastic_response = Net::HTTP.get(URI.parse("#{ENV['ELASTICSEARCH_URL']}/_all/_search?q=text:#{query}"))
|
13
|
+
response.body = response_html(elastic_response)
|
14
|
+
end
|
15
|
+
|
16
|
+
server.start
|
17
|
+
end
|
18
|
+
|
19
|
+
if ENV['DEBUG_PDF_SEARCH'] == '1'
|
20
|
+
start_server.call
|
21
|
+
else
|
22
|
+
self.daemon = Daemons.call(multiple: true, &start_server)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def response_html(elasticsearch_response)
|
27
|
+
results = JSON.parse(elasticsearch_response)['hits']['hits']
|
28
|
+
|
29
|
+
results.map do |result|
|
30
|
+
"<li>#{result["_source"]["text"]}</li>"
|
31
|
+
end.join("\n")
|
32
|
+
end
|
33
|
+
end
|
data/log/pdf_index.log
ADDED
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Arno Korfmann
|
@@ -18,9 +18,13 @@ extensions: []
|
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
20
|
- bin/pdf_search
|
21
|
+
- config/elasticsearch.yml
|
22
|
+
- html/index.html
|
21
23
|
- lib/pdf_dir.rb
|
22
24
|
- lib/pdf_index.rb
|
23
25
|
- lib/pdf_search.rb
|
26
|
+
- lib/web_server.rb
|
27
|
+
- log/pdf_index.log
|
24
28
|
homepage: https://github.com/banalBI/pdfsearch
|
25
29
|
licenses:
|
26
30
|
- MIT
|