pdf_search 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/pdf_search +16 -2
- data/lib/pdf_dir.rb +2 -1
- data/lib/pdf_index.rb +82 -9
- data/lib/pdf_search.rb +1 -25
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 869c6d11db7398dec2c8786b44fafbfa40d42da81606628afaee57182aeab9da
|
4
|
+
data.tar.gz: af4981100c81a4c83843f7b7132674847df4fc36295bed81fedf12e3e81571eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7da27cf671e6f08d639b3acd8038c3c083472489e87fd222c282ca29208699faf2aa7e029687d974f0a2839f1ca51a77e5d1a6721a115b2d0f71958a108afa3e
|
7
|
+
data.tar.gz: 2d894baedc3aede56943a9933c6d7ca127d846a6e80f5ae1b1b69ce75bf2043056a32b529accae1495b9a707a0fbd0a5db62e015baf88be21d41d518d700f5e6
|
data/bin/pdf_search
CHANGED
@@ -3,9 +3,23 @@ require 'pdf_search'
|
|
3
3
|
require 'webrick'
|
4
4
|
|
5
5
|
|
6
|
-
PdfSearch.create_elasticsearch_index
|
7
6
|
PdfSearch.start_webserver
|
8
|
-
|
7
|
+
|
8
|
+
|
9
|
+
if ARGV[1]
|
10
|
+
require File.expand_path(ARGV[1])
|
11
|
+
|
12
|
+
if ARGV[2] == "" || ARGV[2].nil?
|
13
|
+
raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
|
14
|
+
end
|
15
|
+
|
16
|
+
custom_index = eval(ARGV[2])
|
17
|
+
custom_index.create_index
|
18
|
+
custom_index.start_daemon(ARGV[0])
|
19
|
+
else
|
20
|
+
PdfSearch.create_elasticsearch_index
|
21
|
+
PdfSearch::PdfIndex.start_daemon(ARGV[0])
|
22
|
+
end
|
9
23
|
|
10
24
|
|
11
25
|
|
data/lib/pdf_dir.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#encoding: UTF-8
|
2
2
|
require 'pdf-reader'
|
3
|
+
require 'digest'
|
3
4
|
|
4
5
|
module PdfSearch
|
5
6
|
class PdfDir
|
@@ -23,7 +24,7 @@ module PdfSearch
|
|
23
24
|
Enumerator.new do |e|
|
24
25
|
pdf_documents.each do |pdf_reader|
|
25
26
|
pdf_reader.pages.each do |page|
|
26
|
-
e << page
|
27
|
+
e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
|
27
28
|
end
|
28
29
|
end
|
29
30
|
end
|
data/lib/pdf_index.rb
CHANGED
@@ -4,10 +4,43 @@ require 'pdf_dir'
|
|
4
4
|
require 'digest'
|
5
5
|
class PdfSearch::PdfIndex
|
6
6
|
attr_accessor :daemon
|
7
|
+
|
8
|
+
def self.create_index
|
9
|
+
::PdfSearch::ElasticSearchClient.indices.create(
|
10
|
+
index: 'pdf_pages',
|
11
|
+
body: {
|
12
|
+
mappings: {
|
13
|
+
document: {
|
14
|
+
properties: {
|
15
|
+
text: {
|
16
|
+
type: 'text'
|
17
|
+
}
|
18
|
+
}.merge(@properties ||= {})
|
19
|
+
}
|
20
|
+
}
|
21
|
+
}
|
22
|
+
)
|
23
|
+
|
24
|
+
return true
|
25
|
+
|
26
|
+
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
|
27
|
+
if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
|
28
|
+
return false
|
29
|
+
else
|
30
|
+
raise e
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def self.property(property_name, type)
|
35
|
+
@properties ||= {}
|
36
|
+
@properties[property_name] = {type: type}
|
37
|
+
end
|
38
|
+
|
7
39
|
def initialize(pdf_dir)
|
8
40
|
@pdf_dir = pdf_dir
|
9
41
|
@els_client = ::PdfSearch::ElasticSearchClient
|
10
42
|
end
|
43
|
+
|
11
44
|
def self.start_daemon(dir)
|
12
45
|
pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
|
13
46
|
if ENV['DEBUG_PDF_INDEXING']
|
@@ -17,27 +50,67 @@ class PdfSearch::PdfIndex
|
|
17
50
|
end
|
18
51
|
pdf_index
|
19
52
|
end
|
53
|
+
|
20
54
|
def index_loop
|
21
55
|
loop do
|
22
56
|
self.reindex
|
23
57
|
end
|
24
58
|
end
|
59
|
+
|
25
60
|
def pid
|
26
61
|
daemon.pid.pid
|
27
62
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
63
|
+
|
64
|
+
# additional_document_data can be overridden by your custom Index
|
65
|
+
#
|
66
|
+
# class CustomIndex < PdfSearch::PdfIndex
|
67
|
+
# # The attribute that is used (organisation_id) has to be declared to be created when creating or updating the index like following:
|
68
|
+
#
|
69
|
+
# # Name of property TYPE
|
70
|
+
# property :organisation_id, 'string' # or 'text' etc.
|
71
|
+
#
|
72
|
+
# def get_organisation_id
|
73
|
+
# # ...
|
74
|
+
# end
|
75
|
+
#
|
76
|
+
# def additional_document_data(page, reader, pdf_id)
|
77
|
+
# return {
|
78
|
+
# organisation_id: get_organisation_id(pdf_id, page)
|
79
|
+
# }
|
80
|
+
# end
|
81
|
+
# end
|
82
|
+
#
|
83
|
+
def additional_document_data(page, reader, pdf_id)
|
84
|
+
return {}
|
85
|
+
end
|
86
|
+
|
87
|
+
def update_page_document(pdf_id, text, additional_data)
|
88
|
+
end
|
89
|
+
|
90
|
+
def create_page_document(pdf_id, text, additional_data)
|
91
|
+
@els_client.create(
|
33
92
|
index: 'pdf_pages',
|
34
93
|
type: 'document',
|
35
|
-
|
94
|
+
id: combined_pdf_page_id(pdf_id, text),
|
36
95
|
body: {
|
37
|
-
text:
|
38
|
-
}
|
96
|
+
text: text
|
97
|
+
}.merge(additional_data)
|
39
98
|
)
|
40
|
-
|
99
|
+
end
|
100
|
+
|
101
|
+
def combined_pdf_page_id(pdf_id, text)
|
102
|
+
"#{pdf_id}-#{Digest::SHA256.hexdigest(text)}"
|
103
|
+
end
|
104
|
+
|
105
|
+
|
106
|
+
def reindex
|
107
|
+
@pdf_dir.pages.each.with_index do |(page, reader, pdf_id), index|
|
108
|
+
additional_data = additional_document_data(page, reader, pdf_id)
|
109
|
+
begin
|
110
|
+
create_page_document(pdf_id, page.text, additional_data)
|
111
|
+
rescue Elasticsearch::Transport::Transport::Errors::Conflict => e
|
112
|
+
puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text)
|
113
|
+
}, already indexed"
|
41
114
|
end
|
42
115
|
end
|
43
116
|
end
|
data/lib/pdf_search.rb
CHANGED
@@ -25,31 +25,7 @@ module PdfSearch
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def self.create_elasticsearch_index
|
28
|
-
|
29
|
-
|
30
|
-
ElasticSearchClient.indices.create(
|
31
|
-
index: 'pdf_pages',
|
32
|
-
body: {
|
33
|
-
mappings: {
|
34
|
-
document: {
|
35
|
-
properties: {
|
36
|
-
text: {
|
37
|
-
type: 'text'
|
38
|
-
}
|
39
|
-
}
|
40
|
-
}
|
41
|
-
}
|
42
|
-
}
|
43
|
-
)
|
44
|
-
|
45
|
-
return true
|
46
|
-
|
47
|
-
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
|
48
|
-
if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
|
49
|
-
return false
|
50
|
-
else
|
51
|
-
raise e
|
52
|
-
end
|
28
|
+
PdfIndex.create_index
|
53
29
|
end
|
54
30
|
|
55
31
|
def self.wrap_elastic_request
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Arno Korfmann
|
@@ -108,7 +108,8 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
-
description: Run in any directory containing pdfs using `$ pdf_search`
|
111
|
+
description: 'Run in any directory containing pdfs using `$ pdf_search (optional:
|
112
|
+
<folder name>)`'
|
112
113
|
email: manu@korfmann.info
|
113
114
|
executables:
|
114
115
|
- pdf_search
|