pdf_search 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d33439a4266cb68c08fcf67484f09ecc7ac39228387a8e8b0a54758f1569c84
4
- data.tar.gz: 84f39eb8a771abaf45d5b5d38a1342065ba562a664afa2b8e99dbab1f651624a
3
+ metadata.gz: 869c6d11db7398dec2c8786b44fafbfa40d42da81606628afaee57182aeab9da
4
+ data.tar.gz: af4981100c81a4c83843f7b7132674847df4fc36295bed81fedf12e3e81571eb
5
5
  SHA512:
6
- metadata.gz: 84064889972907aa6c5a81ca08a88be0acf43de33ada6bf3d2dad8c44b90dc2e856554de64b9ab44ea29f62dce0e1e670c335a3f8f283de017460cf6702bf372
7
- data.tar.gz: 3f0396b7533dfe0c8d48abd95a96cbafd9066fcffa937b8d4a71f2fab420f49d33a69ecef5004a6346effd94403ec1584588d660c0709b57cf2e1eeb434c8933
6
+ metadata.gz: 7da27cf671e6f08d639b3acd8038c3c083472489e87fd222c282ca29208699faf2aa7e029687d974f0a2839f1ca51a77e5d1a6721a115b2d0f71958a108afa3e
7
+ data.tar.gz: 2d894baedc3aede56943a9933c6d7ca127d846a6e80f5ae1b1b69ce75bf2043056a32b529accae1495b9a707a0fbd0a5db62e015baf88be21d41d518d700f5e6
data/bin/pdf_search CHANGED
@@ -3,9 +3,23 @@ require 'pdf_search'
3
3
  require 'webrick'
4
4
 
5
5
 
6
- PdfSearch.create_elasticsearch_index
7
6
  PdfSearch.start_webserver
8
- PdfSearch::PdfIndex.start_daemon(ARGV[0])
7
+
8
+
9
+ if ARGV[1]
10
+ require File.expand_path(ARGV[1])
11
+
12
+ if ARGV[2] == "" || ARGV[2].nil?
13
+ raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
14
+ end
15
+
16
+ custom_index = eval(ARGV[2])
17
+ custom_index.create_index
18
+ custom_index.start_daemon(ARGV[0])
19
+ else
20
+ PdfSearch.create_elasticsearch_index
21
+ PdfSearch::PdfIndex.start_daemon(ARGV[0])
22
+ end
9
23
 
10
24
 
11
25
 
data/lib/pdf_dir.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #encoding: UTF-8
2
2
  require 'pdf-reader'
3
+ require 'digest'
3
4
 
4
5
  module PdfSearch
5
6
  class PdfDir
@@ -23,7 +24,7 @@ module PdfSearch
23
24
  Enumerator.new do |e|
24
25
  pdf_documents.each do |pdf_reader|
25
26
  pdf_reader.pages.each do |page|
26
- e << page
27
+ e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
27
28
  end
28
29
  end
29
30
  end
data/lib/pdf_index.rb CHANGED
@@ -4,10 +4,43 @@ require 'pdf_dir'
4
4
  require 'digest'
5
5
  class PdfSearch::PdfIndex
6
6
  attr_accessor :daemon
7
+
8
+ def self.create_index
9
+ ::PdfSearch::ElasticSearchClient.indices.create(
10
+ index: 'pdf_pages',
11
+ body: {
12
+ mappings: {
13
+ document: {
14
+ properties: {
15
+ text: {
16
+ type: 'text'
17
+ }
18
+ }.merge(@properties ||= {})
19
+ }
20
+ }
21
+ }
22
+ )
23
+
24
+ return true
25
+
26
+ rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
27
+ if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
28
+ return false
29
+ else
30
+ raise e
31
+ end
32
+ end
33
+
34
+ def self.property(property_name, type)
35
+ @properties ||= {}
36
+ @properties[property_name] = {type: type}
37
+ end
38
+
7
39
  def initialize(pdf_dir)
8
40
  @pdf_dir = pdf_dir
9
41
  @els_client = ::PdfSearch::ElasticSearchClient
10
42
  end
43
+
11
44
  def self.start_daemon(dir)
12
45
  pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
13
46
  if ENV['DEBUG_PDF_INDEXING']
@@ -17,27 +50,67 @@ class PdfSearch::PdfIndex
17
50
  end
18
51
  pdf_index
19
52
  end
53
+
20
54
  def index_loop
21
55
  loop do
22
56
  self.reindex
23
57
  end
24
58
  end
59
+
25
60
  def pid
26
61
  daemon.pid.pid
27
62
  end
28
- def reindex
29
- @pdf_dir.pages.each.with_index do |page, index|
30
- begin
31
- sleep 0.5
32
- @els_client.index(
63
+
64
+ # additional_document_data can be overridden by your custom Index
65
+ #
66
+ # class CustomIndex < PdfSearch::PdfIndex
67
+ # # The attribute used below (organisation_id) must be declared as a property so that it is part of the mapping when the index is created or updated, as follows:
68
+ #
69
+ # # Name of property TYPE
70
+ # property :organisation_id, 'keyword' # or 'text' etc.
71
+ #
72
+ # def get_organisation_id(pdf_id, page)
73
+ # # ...
74
+ # end
75
+ #
76
+ # def additional_document_data(page, reader, pdf_id)
77
+ # return {
78
+ # organisation_id: get_organisation_id(pdf_id, page)
79
+ # }
80
+ # end
81
+ # end
82
+ #
83
+ def additional_document_data(page, reader, pdf_id)
84
+ return {}
85
+ end
86
+
87
+ def update_page_document(pdf_id, text, additional_data)
88
+ end
89
+
90
+ def create_page_document(pdf_id, text, additional_data)
91
+ @els_client.create(
33
92
  index: 'pdf_pages',
34
93
  type: 'document',
35
- id: Digest::SHA256.digest(page.text),
94
+ id: combined_pdf_page_id(pdf_id, text),
36
95
  body: {
37
- text: page.text
38
- }
96
+ text: text
97
+ }.merge(additional_data)
39
98
  )
40
- rescue
99
+ end
100
+
101
+ def combined_pdf_page_id(pdf_id, text)
102
+ "#{pdf_id}-#{Digest::SHA256.hexdigest(text)}"
103
+ end
104
+
105
+
106
+ def reindex
107
+ @pdf_dir.pages.each.with_index do |(page, reader, pdf_id), index|
108
+ additional_data = additional_document_data(page, reader, pdf_id)
109
+ begin
110
+ create_page_document(pdf_id, page.text, additional_data)
111
+ rescue Elasticsearch::Transport::Transport::Errors::Conflict => e
112
+ puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text)
113
+ }, already indexed"
41
114
  end
42
115
  end
43
116
  end
data/lib/pdf_search.rb CHANGED
@@ -25,31 +25,7 @@ module PdfSearch
25
25
  end
26
26
 
27
27
  def self.create_elasticsearch_index
28
- `curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
29
-
30
- ElasticSearchClient.indices.create(
31
- index: 'pdf_pages',
32
- body: {
33
- mappings: {
34
- document: {
35
- properties: {
36
- text: {
37
- type: 'text'
38
- }
39
- }
40
- }
41
- }
42
- }
43
- )
44
-
45
- return true
46
-
47
- rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
48
- if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
49
- return false
50
- else
51
- raise e
52
- end
28
+ PdfIndex.create_index
53
29
  end
54
30
 
55
31
  def self.wrap_elastic_request
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Arno Korfmann
@@ -108,7 +108,8 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
- description: Run in any directory containing pdfs using `$ pdf_search`
111
+ description: 'Run in any directory containing pdfs using `$ pdf_search (optional:
112
+ <folder name>)`'
112
113
  email: manu@korfmann.info
113
114
  executables:
114
115
  - pdf_search