pdf_search 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/pdf_search +16 -2
- data/lib/pdf_dir.rb +2 -1
- data/lib/pdf_index.rb +82 -9
- data/lib/pdf_search.rb +1 -25
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 869c6d11db7398dec2c8786b44fafbfa40d42da81606628afaee57182aeab9da
|
4
|
+
data.tar.gz: af4981100c81a4c83843f7b7132674847df4fc36295bed81fedf12e3e81571eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7da27cf671e6f08d639b3acd8038c3c083472489e87fd222c282ca29208699faf2aa7e029687d974f0a2839f1ca51a77e5d1a6721a115b2d0f71958a108afa3e
|
7
|
+
data.tar.gz: 2d894baedc3aede56943a9933c6d7ca127d846a6e80f5ae1b1b69ce75bf2043056a32b529accae1495b9a707a0fbd0a5db62e015baf88be21d41d518d700f5e6
|
data/bin/pdf_search
CHANGED
@@ -3,9 +3,23 @@ require 'pdf_search'
|
|
3
3
|
require 'webrick'
|
4
4
|
|
5
5
|
|
6
|
-
PdfSearch.create_elasticsearch_index
|
7
6
|
PdfSearch.start_webserver
|
8
|
-
|
7
|
+
|
8
|
+
|
9
|
+
if ARGV[1]
|
10
|
+
require File.expand_path(ARGV[1])
|
11
|
+
|
12
|
+
if ARGV[2] == "" || ARGV[2].nil?
|
13
|
+
raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
|
14
|
+
end
|
15
|
+
|
16
|
+
custom_index = eval(ARGV[2])
|
17
|
+
custom_index.create_index
|
18
|
+
custom_index.start_daemon(ARGV[0])
|
19
|
+
else
|
20
|
+
PdfSearch.create_elasticsearch_index
|
21
|
+
PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
22
|
+
end
|
9
23
|
|
10
24
|
|
11
25
|
|
data/lib/pdf_dir.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#encoding: UTF-8
|
2
2
|
require 'pdf-reader'
|
3
|
+
require 'digest'
|
3
4
|
|
4
5
|
module PdfSearch
|
5
6
|
class PdfDir
|
@@ -23,7 +24,7 @@ module PdfSearch
|
|
23
24
|
Enumerator.new do |e|
|
24
25
|
pdf_documents.each do |pdf_reader|
|
25
26
|
pdf_reader.pages.each do |page|
|
26
|
-
e << page
|
27
|
+
e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
|
27
28
|
end
|
28
29
|
end
|
29
30
|
end
|
data/lib/pdf_index.rb
CHANGED
@@ -4,10 +4,43 @@ require 'pdf_dir'
|
|
4
4
|
require 'digest'
|
5
5
|
class PdfSearch::PdfIndex
|
6
6
|
attr_accessor :daemon
|
7
|
+
|
8
|
+
def self.create_index
|
9
|
+
::PdfSearch::ElasticSearchClient.indices.create(
|
10
|
+
index: 'pdf_pages',
|
11
|
+
body: {
|
12
|
+
mappings: {
|
13
|
+
document: {
|
14
|
+
properties: {
|
15
|
+
text: {
|
16
|
+
type: 'text'
|
17
|
+
}
|
18
|
+
}.merge(@properties ||= {})
|
19
|
+
}
|
20
|
+
}
|
21
|
+
}
|
22
|
+
)
|
23
|
+
|
24
|
+
return true
|
25
|
+
|
26
|
+
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
|
27
|
+
if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
|
28
|
+
return false
|
29
|
+
else
|
30
|
+
raise e
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def self.property(property_name, type)
|
35
|
+
@properties ||= {}
|
36
|
+
@properties[property_name] = {type: type}
|
37
|
+
end
|
38
|
+
|
7
39
|
def initialize(pdf_dir)
|
8
40
|
@pdf_dir = pdf_dir
|
9
41
|
@els_client = ::PdfSearch::ElasticSearchClient
|
10
42
|
end
|
43
|
+
|
11
44
|
def self.start_daemon(dir)
|
12
45
|
pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
|
13
46
|
if ENV['DEBUG_PDF_INDEXING']
|
@@ -17,27 +50,67 @@ class PdfSearch::PdfIndex
|
|
17
50
|
end
|
18
51
|
pdf_index
|
19
52
|
end
|
53
|
+
|
20
54
|
def index_loop
|
21
55
|
loop do
|
22
56
|
self.reindex
|
23
57
|
end
|
24
58
|
end
|
59
|
+
|
25
60
|
def pid
|
26
61
|
daemon.pid.pid
|
27
62
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
63
|
+
|
64
|
+
# additional_document_data can be overridden by your custom Index
|
65
|
+
#
|
66
|
+
# class CustomIndex < PdfSearch::PdfIndex
|
67
|
+
# # The attribute that is used (organisation_id) has to be declared to be created when creating or updating the index like following:
|
68
|
+
#
|
69
|
+
# # Name of property TYPE
|
70
|
+
# property :organisation_id, 'string' # or 'text' etc.
|
71
|
+
#
|
72
|
+
# def get_organisation_id
|
73
|
+
# # ...
|
74
|
+
# end
|
75
|
+
#
|
76
|
+
# def additional_document_data(page, reader, pdf_id)
|
77
|
+
# return {
|
78
|
+
# organisation_id: get_organisation_id(pdf_id, page)
|
79
|
+
# }
|
80
|
+
# end
|
81
|
+
# end
|
82
|
+
#
|
83
|
+
def additional_document_data(page, reader, pdf_id)
|
84
|
+
return {}
|
85
|
+
end
|
86
|
+
|
87
|
+
def update_page_document(pdf_id, text, additional_data)
|
88
|
+
end
|
89
|
+
|
90
|
+
def create_page_document(pdf_id, text, additional_data)
|
91
|
+
@els_client.create(
|
33
92
|
index: 'pdf_pages',
|
34
93
|
type: 'document',
|
35
|
-
|
94
|
+
id: combined_pdf_page_id(pdf_id, text),
|
36
95
|
body: {
|
37
|
-
text:
|
38
|
-
}
|
96
|
+
text: text
|
97
|
+
}.merge(additional_data)
|
39
98
|
)
|
40
|
-
|
99
|
+
end
|
100
|
+
|
101
|
+
def combined_pdf_page_id(pdf_id, text)
|
102
|
+
"#{pdf_id}-#{Digest::SHA256.hexdigest(text)}"
|
103
|
+
end
|
104
|
+
|
105
|
+
|
106
|
+
def reindex
|
107
|
+
@pdf_dir.pages.each.with_index do |(page, reader, pdf_id), index|
|
108
|
+
additional_data = additional_document_data(page, reader, pdf_id)
|
109
|
+
begin
|
110
|
+
create_page_document(pdf_id, page.text, additional_data)
|
111
|
+
rescue Elasticsearch::Transport::Transport::Errors::Conflict => e
|
112
|
+
puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text)
|
113
|
+
}, already indexed"
|
41
114
|
end
|
42
115
|
end
|
43
116
|
end
|
data/lib/pdf_search.rb
CHANGED
@@ -25,31 +25,7 @@ module PdfSearch
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def self.create_elasticsearch_index
|
28
|
-
|
29
|
-
|
30
|
-
ElasticSearchClient.indices.create(
|
31
|
-
index: 'pdf_pages',
|
32
|
-
body: {
|
33
|
-
mappings: {
|
34
|
-
document: {
|
35
|
-
properties: {
|
36
|
-
text: {
|
37
|
-
type: 'text'
|
38
|
-
}
|
39
|
-
}
|
40
|
-
}
|
41
|
-
}
|
42
|
-
}
|
43
|
-
)
|
44
|
-
|
45
|
-
return true
|
46
|
-
|
47
|
-
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
|
48
|
-
if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
|
49
|
-
return false
|
50
|
-
else
|
51
|
-
raise e
|
52
|
-
end
|
28
|
+
PdfIndex.create_index
|
53
29
|
end
|
54
30
|
|
55
31
|
def self.wrap_elastic_request
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Arno Korfmann
|
@@ -108,7 +108,8 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
-
description: Run in any directory containing pdfs using `$ pdf_search`
|
111
|
+
description: 'Run in any directory containing pdfs using `$ pdf_search (optional:
|
112
|
+
<folder name>)`'
|
112
113
|
email: manu@korfmann.info
|
113
114
|
executables:
|
114
115
|
- pdf_search
|