RubyGems - pdfbox_text_extraction - Versions diffs - 1.0.1 → 1.0.2 - Mend

pdfbox_text_extraction 1.0.1 → 1.0.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/lib/pdfbox_text_extraction.rb +4 -10
data/lib/pdfbox_text_extraction/version.rb +1 -1
data/spec/pdfbox_text_extraction_spec.rb +39 -0
data/spec/spec_helper.rb +5 -0
data/spec/test_file.odt +0 -0
data/spec/test_file.pdf +0 -0
metadata +10 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 037d71d25f199a1239bc1af7baff641840ab9f4a
-  data.tar.gz: 51ba95d37d5aac68439a348f0c139ab648759760
+  metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
+  data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
 SHA512:
-  metadata.gz: 10fb9b69f45e7568d2508de9f21987d51d2fd0476a1acc2e4799f588fc16d396a374730db43e2e01734d9b2d4edc44efca502043ec2e71dd88bfc4497b084555
-  data.tar.gz: fde63fedb74600ec80978af3c81f63f135304ff284e75772f953bd6a1e6f2030b0d9668a421647b5497823558100cb6e7e04badacae284f78e61ef1aa972a3ca
+  metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
+  data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,8 @@
+### 1.0.2
+* Added specs
+* Refactorings and bug fixes
 ### 1.0.1
 * Fixed file name

data/lib/pdfbox_text_extraction.rb CHANGED Viewed

@@ -31,18 +31,12 @@ class PdfboxTextExtraction
   # @param option [Float] crop_width crop area width
   # @param option [Float] crop_height crop area height
   # @return [String] the extracted text
-  def self.run(path_to_pdf, options)
-    extract_text(path_to_pdf, options)
-  end
-  # Extracts text
-  # @see #run
-  def self.extract_text(pdf_filepath, options)
-    file = File.new(pdf_filepath)
+  def self.run(path_to_pdf, options={})
+    file = File.new(path_to_pdf)
     pd_doc = PDDocument.load(file)
     text_stripper = nil
     all_text = ''
-    if %i[crop_x crop_y crop_width crop_height].any? { |e| options[e] }
+    if [:crop_x, :crop_y, :crop_width, :crop_height].any? { |e| options[e] }
       # crop options given, extract from crop area only
       res = 72
       body_text_rect = Rectangle2D::Float.new(
@@ -74,7 +68,7 @@ class PdfboxTextExtraction
   # Sets params on text_stripper.
   # @param text_stripper [PDFTextStripper]
-  def configure_text_extraction_params(text_stripper)
+  def self.configure_text_extraction_params(text_stripper)
     # *****************************************************
     # Extraction thresholds and tolerances

data/lib/pdfbox_text_extraction/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class PdfboxTextExtraction
-  VERSION = "1.0.1"
+  VERSION = "1.0.2"
 end

data/spec/pdfbox_text_extraction_spec.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+require_relative './spec_helper'
+describe PdfboxTextExtraction do
+  describe ".run" do
+    let(:pdf_file_path) { File.expand_path("../test_file.pdf", __FILE__) }
+    it "extracts full page text" do
+      extracted_text = PdfboxTextExtraction.run(pdf_file_path)
+      extracted_text.must_equal(
+        [
+          'This is a test pdf for the pdfbox_text_extraction Ruby gem.',
+          'Text at the top of the page.',
+          'Text in the middle of the page.',
+          'Text at the bottom of the page.',
+          '',
+        ].join("\n")
+      )
+    end
+    it "extracts crop area text" do
+      extracted_text = PdfboxTextExtraction.run(
+        pdf_file_path,
+        {
+          crop_x: 0,
+          crop_y: 3.0,
+          crop_width: 8.5,
+          crop_height: 6.0,
+        }
+      )
+      extracted_text.must_equal("Text in the middle of the page.\n\n")
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+require 'bundler/setup'
+require 'minitest/autorun'
+require 'pdfbox_text_extraction'

data/spec/test_file.odt ADDED Viewed

Binary file

data/spec/test_file.pdf ADDED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdfbox_text_extraction
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Jo Hund
@@ -69,6 +69,10 @@ files:
 - lib/pdfbox_text_extraction.rb
 - lib/pdfbox_text_extraction/version.rb
 - pdfbox_text_extraction.gemspec
+- spec/pdfbox_text_extraction_spec.rb
+- spec/spec_helper.rb
+- spec/test_file.odt
+- spec/test_file.pdf
 - vendor/pdfbox/commons-logging-1.2/LICENSE.txt
 - vendor/pdfbox/commons-logging-1.2/NOTICE.txt
 - vendor/pdfbox/commons-logging-1.2/RELEASE-NOTES.txt
@@ -100,4 +104,8 @@ rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: Extract plain text from PDF documents.
-test_files: []
+test_files:
+- spec/pdfbox_text_extraction_spec.rb
+- spec/spec_helper.rb
+- spec/test_file.odt
+- spec/test_file.pdf