RubyGems - pdf_search - Versions diffs - 0.0.6 → 0.0.7 - Mend

pdf_search 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3d33439a4266cb68c08fcf67484f09ecc7ac39228387a8e8b0a54758f1569c84
-  data.tar.gz: 84f39eb8a771abaf45d5b5d38a1342065ba562a664afa2b8e99dbab1f651624a
+  metadata.gz: 869c6d11db7398dec2c8786b44fafbfa40d42da81606628afaee57182aeab9da
+  data.tar.gz: af4981100c81a4c83843f7b7132674847df4fc36295bed81fedf12e3e81571eb
 SHA512:
-  metadata.gz: 84064889972907aa6c5a81ca08a88be0acf43de33ada6bf3d2dad8c44b90dc2e856554de64b9ab44ea29f62dce0e1e670c335a3f8f283de017460cf6702bf372
-  data.tar.gz: 3f0396b7533dfe0c8d48abd95a96cbafd9066fcffa937b8d4a71f2fab420f49d33a69ecef5004a6346effd94403ec1584588d660c0709b57cf2e1eeb434c8933
+  metadata.gz: 7da27cf671e6f08d639b3acd8038c3c083472489e87fd222c282ca29208699faf2aa7e029687d974f0a2839f1ca51a77e5d1a6721a115b2d0f71958a108afa3e
+  data.tar.gz: 2d894baedc3aede56943a9933c6d7ca127d846a6e80f5ae1b1b69ce75bf2043056a32b529accae1495b9a707a0fbd0a5db62e015baf88be21d41d518d700f5e6

data/bin/pdf_search CHANGED Viewed

@@ -3,9 +3,23 @@ require 'pdf_search'
 require 'webrick'
-PdfSearch.create_elasticsearch_index
 PdfSearch.start_webserver
-PdfSearch::PdfIndex.start_daemon(ARGV[0])
+if ARGV[1]
+  require File.expand_path(ARGV[1])
+  if ARGV[2] == "" || ARGV[2].nil?
+    raise ArgumentError.new("Constant name of custom indexer must be provided as third argument")
+  end
+  custom_index = eval(ARGV[2])
+  custom_index.create_index
+  custom_index.start_daemon(ARGV[0])
+else
+  PdfSearch.create_elasticsearch_index
+  PdfSearch::PdfIndex.start_daemon(ARGV[0])
+end

data/lib/pdf_dir.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 #encoding: UTF-8
 require 'pdf-reader'
+require 'digest'
 module PdfSearch
   class PdfDir
@@ -23,7 +24,7 @@ module PdfSearch
 	  Enumerator.new do |e|
 		pdf_documents.each do |pdf_reader|
 			pdf_reader.pages.each do |page|
-				e << page
+				e << [page, pdf_reader, Digest::SHA256.hexdigest(Marshal.dump(pdf_reader.info))]
 			end
 		end
 	  end

data/lib/pdf_index.rb CHANGED Viewed

@@ -4,10 +4,43 @@ require 'pdf_dir'
 require 'digest'
 class PdfSearch::PdfIndex
 	attr_accessor :daemon
+  def self.create_index
+		::PdfSearch::ElasticSearchClient.indices.create(
+			index: 'pdf_pages',
+			body: {
+				mappings: {
+					document: {
+						properties: {
+							text: {
+								type: 'text'
+							}
+						}.merge(@properties ||= {})
+					}
+				}
+			}
+		)
+		return true
+	rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
+		if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
+			return false
+		else
+			raise e
+		end
+  end
+  def self.property(property_name, type)
+    @properties ||= {}
+    @properties[property_name] = {type: type}
+  end
 	def initialize(pdf_dir)
 		@pdf_dir = pdf_dir
 		@els_client = ::PdfSearch::ElasticSearchClient
 	end
 	def self.start_daemon(dir)
 	  pdf_index = self.new(::PdfSearch::PdfDir.new(dir))
     if ENV['DEBUG_PDF_INDEXING']
@@ -17,27 +50,67 @@ class PdfSearch::PdfIndex
     end
 	  pdf_index
 	end
   def index_loop
 	  loop do
 		  self.reindex
 	  end
   end
 	def pid
      daemon.pid.pid
 	end
-	def reindex
-		@pdf_dir.pages.each.with_index do |page, index|
-      begin
-        sleep 0.5
-			  @els_client.index(
+  # additional_document_data can be overridden by your custom Index
+  #
+  # class CustomIndex < PdfSearch::PdfIndex
+  #   # Each attribute returned by additional_document_data (here: organisation_id) has to be declared with `property`, as shown below, so that it is included in the mapping when the index is created or updated:
+  #
+  #           # Name of property   TYPE
+  #   property :organisation_id, 'string' # or 'text' etc.
+  #
+  #   def get_organisation_id(pdf_id, page)
+  #     # ...
+  #   end
+  #
+  #   def additional_document_data(page, reader, pdf_id)
+  #    return {
+  #      organisation_id: get_organisation_id(pdf_id, page)
+  #    }
+  #   end
+  # end
+  #
+  def additional_document_data(page, reader, pdf_id)
+    return {}
+  end
+  def update_page_document(pdf_id, text, additional_data)
+  end
+  def create_page_document(pdf_id, text, additional_data)
+	  @els_client.create(
 			  	index: 'pdf_pages',
 			  	type: 'document',
-			  	id: Digest::SHA256.digest(page.text),
+          id: combined_pdf_page_id(pdf_id, text),
 			  	body: {
-			  		text: page.text
-			  	}
+			  		text: text
+			  	}.merge(additional_data)
 			  )
-      rescue
+  end
+  def combined_pdf_page_id(pdf_id, text)
+    "#{pdf_id}-#{Digest::SHA256.hexdigest(text)}"
+  end
+	def reindex
+		@pdf_dir.pages.each.with_index do |(page, reader, pdf_id), index|
+      additional_data = additional_document_data(page, reader, pdf_id)
+      begin
+        create_page_document(pdf_id, page.text, additional_data)
+		  rescue Elasticsearch::Transport::Transport::Errors::Conflict => e
+        puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text)}, already indexed"
       end
 		end
 	end

data/lib/pdf_search.rb CHANGED Viewed

@@ -25,31 +25,7 @@ module PdfSearch
 	end
 	def self.create_elasticsearch_index
-		`curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'`
-		ElasticSearchClient.indices.create(
-			index: 'pdf_pages',
-			body: {
-				mappings: {
-					document: {
-						properties: {
-							text: {
-								type: 'text'
-							}
-						}
-					}
-				}
-			}
-		)
-		return true
-	rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
-		if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message
-			return false
-		else
-			raise e
-		end
+    PdfIndex.create_index
 	end
   def self.wrap_elastic_request

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf_search
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Manuel Arno Korfmann
@@ -108,7 +108,8 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description: Run in any directory containing pdfs using `$ pdf_search`
+description: 'Run in any directory containing pdfs using `$ pdf_search (optional:
+  <folder name>)`'
 email: manu@korfmann.info
 executables:
 - pdf_search