RubyGems - pdfbox_text_extraction - Versions diffs - 1.0.2 → 1.1.0 - Mend

pdfbox_text_extraction 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/pdfbox_text_extraction.rb +47 -17
data/lib/pdfbox_text_extraction/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
-  data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
+  metadata.gz: 70c791a4fe2ab12583748f72ba913773ee783f13
+  data.tar.gz: 38d02df12fdd6cb6a9ced823a9177620d3ddd837
 SHA512:
-  metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
-  data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
+  metadata.gz: a4b55b665c756ce6f706bedb871bc9ddb143a4c8b4929efff62715f6032937ce9202c565e556fbae81aa6ddaef53afd3a5bae8a72dfe6f5078140cfceca3b770
+  data.tar.gz: c15b82ece24ba36a172d94451f3e39bfaf1d3bb3b0226ab42eb9e172d28618f2b415196b9ca0fa8a5d6241a2f87e6495dc51698b513f9aad6dbc75d7bf6aacd0

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,7 @@
+## 1.1.0
+* Allow overriding of extraction params
 ### 1.0.2
 * Added specs

data/lib/pdfbox_text_extraction.rb CHANGED Viewed

@@ -26,10 +26,20 @@ class PdfboxTextExtraction
   #
   # @param path_to_pdf [String]
   # @param options [Hash, optional]
-  # @param option [Float] crop_x crop area top left corner x-coordinate
-  # @param option [Float] crop_y crop area top left corner y-coordinate
-  # @param option [Float] crop_width crop area width
-  # @param option [Float] crop_height crop area height
+  # @option options [Float] crop_x crop area top left corner x-coordinate
+  # @option options [Float] crop_y crop area top left corner y-coordinate
+  # @option options [Float] crop_width crop area width
+  # @option options [Float] crop_height crop area height
+  # @option options [Float] average_char_tolerance
+  # @option options [Float] drop_threshold
+  # @option options [Float] indent_threshold
+  # @option options [Float] spacing_tolerance
+  # @option options [Boolean] sort_by_position
+  # @option options [String] line_separator
+  # @option options [String] page_end
+  # @option options [String] page_start
+  # @option options [String] paragraph_end
+  # @option options [String] paragraph_start
   # @return [String] the extracted text
   def self.run(path_to_pdf, options={})
     file = File.new(path_to_pdf)
@@ -47,7 +57,7 @@ class PdfboxTextExtraction
       )
       text_stripper = PDFTextStripperByArea.new
       text_stripper.addRegion("bodyText", body_text_rect)
-      configure_text_extraction_params(text_stripper)
+      configure_text_extraction_params(text_stripper, options)
       pd_doc.getPages.each do |page|
         text_stripper.extractRegions(page)
@@ -57,7 +67,7 @@ class PdfboxTextExtraction
     else
       # No crop options given, extract all text
       text_stripper = PDFTextStripper.new
-      configure_text_extraction_params(text_stripper)
+      configure_text_extraction_params(text_stripper, options)
       all_text << text_stripper.getText(pd_doc)
     end
@@ -68,7 +78,7 @@ class PdfboxTextExtraction
   # Sets params on text_stripper.
   # @param text_stripper [PDFTextStripper]
-  def self.configure_text_extraction_params(text_stripper)
+  def self.configure_text_extraction_params(text_stripper, options)
     # *****************************************************
     # Extraction thresholds and tolerances
@@ -76,25 +86,33 @@ class PdfboxTextExtraction
     # Set the character width-based tolerance value that is used to estimate
     # where spaces in text should be added.
     # Default: 0.30000001192092896
-    # text_stripper.setAverageCharTolerance(0.30000001192092896)
+    if(o = options[:average_char_tolerance])
+      text_stripper.setAverageCharTolerance(o)
+    end
     # Set the minimum whitespace, as a multiple of the max height of the current
     # characters beyond which the current line start is considered to be a
     # paragraph start.
     # Default: 2.5
-    # text_stripper.setDropThreshold(2.5)
+    if(o = options[:drop_threshold])
+      text_stripper.setDropThreshold(o)
+    end
     # Set the multiple of whitespace character widths for the current text
     # which the current line start can be indented from the previous line
     # start beyond which the current line start is considered to be a
     # paragraph start.
     # Default: 2.0
-    # text_stripper.setIndentThreshold(2.0)
+    if(o = options[:indent_threshold])
+      text_stripper.setIndentThreshold(o)
+    end
     # Set the space width-based tolerance value that is used to estimate where
     # spaces in text should be added.
     # Default: 0.5
-    text_stripper.setSpacingTolerance(0.3)
+    if(o = options[:spacing_tolerance])
+      text_stripper.setSpacingTolerance(o)
+    end
     # *****************************************************
     # Sort order
@@ -102,30 +120,42 @@ class PdfboxTextExtraction
     # The order of the text tokens in a PDF file may not be the same as
     # they appear visually on the screen.
     # Default: false
-    text_stripper.setSortByPosition(false)
+    if !(o = options[:sort_by_position]).nil? # Allow override of false
+      text_stripper.setSortByPosition(o)
+    end
     # *****************************************************
     # Separator tokens
     # Set the desired line separator for output text.
     # Default: "\n"
-    # text_stripper.setLineSeparator("\n")
+    if(o = options[:line_separator])
+      text_stripper.setLineSeparator(o)
+    end
     # Set the string which will be used at the end of a page.
     # Default: ""
-    # text_stripper.setPageEnd("<<page-end>>")
+    if(o = options[:page_end])
+      text_stripper.setPageEnd(o)
+    end
     # Set the string which will be used at the start of a page.
     # Default: ""
-    # text_stripper.setPageStart("<<page-start>>")
+    if(o = options[:page_start])
+      text_stripper.setPageStart(o)
+    end
     # Set the string which will be used at the end of a paragraph.
     # Default: ""
-    # text_stripper.setParagraphEnd("<<paragraph-end>>")
+    if(o = options[:paragraph_end])
+      text_stripper.setParagraphEnd(o)
+    end
     # Set the string which will be used at the start of a paragraph.
     # Default: ""
-    # text_stripper.setParagraphStart("<<paragraph-start>>")
+    if(o = options[:paragraph_start])
+      text_stripper.setParagraphStart(o)
+    end
   end

data/lib/pdfbox_text_extraction/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class PdfboxTextExtraction
-  VERSION = "1.0.2"
+  VERSION = "1.1.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdfbox_text_extraction
 version: !ruby/object:Gem::Version
-  version: 1.0.2
+  version: 1.1.0
 platform: ruby
 authors:
 - Jo Hund
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-19 00:00:00.000000000 Z
+date: 2016-04-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement