pdf_search 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d33439a4266cb68c08fcf67484f09ecc7ac39228387a8e8b0a54758f1569c84
4
- data.tar.gz: 84f39eb8a771abaf45d5b5d38a1342065ba562a664afa2b8e99dbab1f651624a
3
+ metadata.gz: 869c6d11db7398dec2c8786b44fafbfa40d42da81606628afaee57182aeab9da
4
+ data.tar.gz: af4981100c81a4c83843f7b7132674847df4fc36295bed81fedf12e3e81571eb
5
5
  SHA512:
6
- metadata.gz: 84064889972907aa6c5a81ca08a88be0acf43de33ada6bf3d2dad8c44b90dc2e856554de64b9ab44ea29f62dce0e1e670c335a3f8f283de017460cf6702bf372
7
- data.tar.gz: 3f0396b7533dfe0c8d48abd95a96cbafd9066fcffa937b8d4a71f2fab420f49d33a69ecef5004a6346effd94403ec1584588d660c0709b57cf2e1eeb434c8933
6
+ metadata.gz: 7da27cf671e6f08d639b3acd8038c3c083472489e87fd222c282ca29208699faf2aa7e029687d974f0a2839f1ca51a77e5d1a6721a115b2d0f71958a108afa3e
7
+ data.tar.gz: 2d894baedc3aede56943a9933c6d7ca127d846a6e80f5ae1b1b69ce75bf2043056a32b529accae1495b9a707a0fbd0a5db62e015baf88be21d41d518d700f5e6
data/bin/pdf_search CHANGED
@@ -3,9 +3,23 @@ require 'pdf_search'
3
3
  require 'webrick'
4
4
 
5
5
 
6
- PdfSearch.create_elasticsearch_index
7
6
  PdfSearch.start_webserver
8
- PdfSearch::PdfIndex.start_daemon(ARGV[0])
7
+
8
+
9
+ if ARGV[1]
10
+ require File.expand_path(ARGV[1])
11
+
12
+ if ARGV[2] == "" || ARGV[2].nil?
13
+ raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
14
+ end
15
+
16
+ custom_index = eval(ARGV[2])
17
+ custom_index.create_index
18
+ custom_index.start_daemon(ARGV[0])
19
+ else
20
+ PdfSearch.create_elasticsearch_index
21
+ PdfSearch::PdfIndex.start_daemon(ARGV[0])
22
+ end
9
23
 
10
24
 
11
25
 
data/lib/pdf_dir.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #encoding: UTF-8
2
2
  require 'pdf-reader'
3
+ require 'digest'
3
4
 
4
5
  module PdfSearch
5
6
  class PdfDir
@@ -23,7 +24,7 @@ module PdfSearch
23
24
  Enumerator.new do |e|
24
25
  pdf_documents.each do |pdf_reader|
25
26
  pdf_reader.pages.each do |page|
26
- e << page
27
+ e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
27
28
  end
28
29
  end
29
30
  end
data/lib/pdf_index.rb CHANGED
@@ -4,10 +4,43 @@ require 'pdf_dir'
4
4
  require 'digest'
5
5
  class PdfSearch::PdfIndex
6
6
  attr_accessor :daemon
7
+
8
+ def self.create_index
9
+ ::PdfSearch::ElasticSearchClient.indices.create(
10
+ index: 'pdf_pages',
11
+ body: {
12
+ mappings: {
13
+ document: {
14
+ properties: {
15
+ text: {
16
+ type: 'text'
17
+ }
18
+ }.merge(@properties ||= {})
19
+ }
20
+ }
21
+ }
22
+ )
23
+
24
+ return true
25
+
26
+ rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
27
+ if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
28
+ return false
29
+ else
30
+ raise e
31
+ end
32
+ end
33
+
34
+ def self.property(property_name, type)
35
+ @properties ||= {}
36
+ @properties[property_name] = {type: type}
37
+ end
38
+
7
39
  def initialize(pdf_dir)
8
40
  @pdf_dir = pdf_dir
9
41
  @els_client = ::PdfSearch::ElasticSearchClient
10
42
  end
43
+
11
44
  def self.start_daemon(dir)
12
45
  pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
13
46
  if ENV['DEBUG_PDF_INDEXING']
@@ -17,27 +50,67 @@ class PdfSearch::PdfIndex
17
50
  end
18
51
  pdf_index
19
52
  end
53
+
20
54
  def index_loop
21
55
  loop do
22
56
  self.reindex
23
57
  end
24
58
  end
59
+
25
60
  def pid
26
61
  daemon.pid.pid
27
62
  end
28
- def reindex
29
- @pdf_dir.pages.each.with_index do |page, index|
30
- begin
31
- sleep 0.5
32
- @els_client.index(
63
+
64
+ # additional_document_data can be overridden by your custom Index
65
+ #
66
+ # class CustomIndex < PdfSearch::PdfIndex
67
+ # # Any attribute used here (organisation_id) must be declared with `property`, as follows, so that it is included in the mapping when the index is created or updated:
68
+ #
69
+ # # Name of property TYPE
70
+ # property :organisation_id, 'string' # or 'text' etc.
71
+ #
72
+ # def get_organisation_id
73
+ # # ...
74
+ # end
75
+ #
76
+ # def additional_document_data(page, reader, pdf_id)
77
+ # return {
78
+ # organisation_id: get_organisation_id(pdf_id, page)
79
+ # }
80
+ # end
81
+ # end
82
+ #
83
+ def additional_document_data(page, reader, pdf_id)
84
+ return {}
85
+ end
86
+
87
+ def update_page_document(pdf_id, text, additional_data)
88
+ end
89
+
90
+ def create_page_document(pdf_id, text, additional_data)
91
+ @els_client.create(
33
92
  index: 'pdf_pages',
34
93
  type: 'document',
35
- id: Digest::SHA256.digest(page.text),
94
+ id: combined_pdf_page_id(pdf_id, text),
36
95
  body: {
37
- text: page.text
38
- }
96
+ text: text
97
+ }.merge(additional_data)
39
98
  )
40
- rescue
99
+ end
100
+
101
+ def combined_pdf_page_id(pdf_id, text)
102
+ "#{pdf_id}-#{Digest::SHA256.hexdigest(text)}"
103
+ end
104
+
105
+
106
+ def reindex
107
+ @pdf_dir.pages.each.with_index do |(page, reader, pdf_id), index|
108
+ additional_data = additional_document_data(page, reader, pdf_id)
109
+ begin
110
+ create_page_document(pdf_id, page.text, additional_data)
111
+ rescue Elasticsearch::Transport::Transport::Errors::Conflict => e
112
+ puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text)
113
+ }, already indexed"
41
114
  end
42
115
  end
43
116
  end
data/lib/pdf_search.rb CHANGED
@@ -25,31 +25,7 @@ module PdfSearch
25
25
  end
26
26
 
27
27
  def self.create_elasticsearch_index
28
- `curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
29
-
30
- ElasticSearchClient.indices.create(
31
- index: 'pdf_pages',
32
- body: {
33
- mappings: {
34
- document: {
35
- properties: {
36
- text: {
37
- type: 'text'
38
- }
39
- }
40
- }
41
- }
42
- }
43
- )
44
-
45
- return true
46
-
47
- rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
48
- if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
49
- return false
50
- else
51
- raise e
52
- end
28
+ PdfIndex.create_index
53
29
  end
54
30
 
55
31
  def self.wrap_elastic_request
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Arno Korfmann
@@ -108,7 +108,8 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
- description: Run in any directory containing pdfs using `$ pdf_search`
111
+ description: 'Run in any directory containing pdfs using `$ pdf_search (optional:
112
+ <folder name>)`'
112
113
  email: manu@korfmann.info
113
114
  executables:
114
115
  - pdf_search