pdfbox_text_extraction 1.0.1 → 1.0.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 037d71d25f199a1239bc1af7baff641840ab9f4a
4
- data.tar.gz: 51ba95d37d5aac68439a348f0c139ab648759760
3
+ metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
4
+ data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
5
5
  SHA512:
6
- metadata.gz: 10fb9b69f45e7568d2508de9f21987d51d2fd0476a1acc2e4799f588fc16d396a374730db43e2e01734d9b2d4edc44efca502043ec2e71dd88bfc4497b084555
7
- data.tar.gz: fde63fedb74600ec80978af3c81f63f135304ff284e75772f953bd6a1e6f2030b0d9668a421647b5497823558100cb6e7e04badacae284f78e61ef1aa972a3ca
6
+ metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
7
+ data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ### 1.0.2
2
+
3
+ * Added specs
4
+ * Refactorings and bug fixes
5
+
1
6
  ### 1.0.1
2
7
 
3
8
  * Fixed file name
@@ -31,18 +31,12 @@ class PdfboxTextExtraction
31
31
  # @param option [Float] crop_width crop area width
32
32
  # @param option [Float] crop_height crop area height
33
33
  # @return [String] the extracted text
34
- def self.run(path_to_pdf, options)
35
- extract_text(path_to_pdf, options)
36
- end
37
-
38
- # Extracts text
39
- # @see #run
40
- def self.extract_text(pdf_filepath, options)
41
- file = File.new(pdf_filepath)
34
+ def self.run(path_to_pdf, options={})
35
+ file = File.new(path_to_pdf)
42
36
  pd_doc = PDDocument.load(file)
43
37
  text_stripper = nil
44
38
  all_text = ''
45
- if %i[crop_x crop_y crop_width crop_height].any? { |e| options[e] }
39
+ if [:crop_x, :crop_y, :crop_width, :crop_height].any? { |e| options[e] }
46
40
  # crop options given, extract from crop area only
47
41
  res = 72
48
42
  body_text_rect = Rectangle2D::Float.new(
@@ -74,7 +68,7 @@ class PdfboxTextExtraction
74
68
 
75
69
  # Sets params on text_stripper.
76
70
  # @param text_stripper [PDFTextStripper]
77
- def configure_text_extraction_params(text_stripper)
71
+ def self.configure_text_extraction_params(text_stripper)
78
72
 
79
73
  # *****************************************************
80
74
  # Extraction thresholds and tolerances
@@ -1,3 +1,3 @@
1
1
  class PdfboxTextExtraction
2
- VERSION = "1.0.1"
2
+ VERSION = "1.0.2"
3
3
  end
@@ -0,0 +1,39 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require_relative './spec_helper'
4
+
5
+ describe PdfboxTextExtraction do
6
+
7
+ describe ".run" do
8
+
9
+ let(:pdf_file_path) { File.expand_path("../test_file.pdf", __FILE__) }
10
+
11
+ it "extracts full page text" do
12
+ extracted_text = PdfboxTextExtraction.run(pdf_file_path)
13
+ extracted_text.must_equal(
14
+ [
15
+ 'This is a test pdf for the pdfbox_text_extraction Ruby gem.',
16
+ 'Text at the top of the page.',
17
+ 'Text in the middle of the page.',
18
+ 'Text at the bottom of the page.',
19
+ '',
20
+ ].join("\n")
21
+ )
22
+ end
23
+
24
+ it "extracts crop area text" do
25
+ extracted_text = PdfboxTextExtraction.run(
26
+ pdf_file_path,
27
+ {
28
+ crop_x: 0,
29
+ crop_y: 3.0,
30
+ crop_width: 8.5,
31
+ crop_height: 6.0,
32
+ }
33
+ )
34
+ extracted_text.must_equal("Text in the middle of the page.\n\n")
35
+ end
36
+
37
+ end
38
+
39
+ end
@@ -0,0 +1,5 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'bundler/setup'
4
+ require 'minitest/autorun'
5
+ require 'pdfbox_text_extraction'
Binary file
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfbox_text_extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jo Hund
@@ -69,6 +69,10 @@ files:
69
69
  - lib/pdfbox_text_extraction.rb
70
70
  - lib/pdfbox_text_extraction/version.rb
71
71
  - pdfbox_text_extraction.gemspec
72
+ - spec/pdfbox_text_extraction_spec.rb
73
+ - spec/spec_helper.rb
74
+ - spec/test_file.odt
75
+ - spec/test_file.pdf
72
76
  - vendor/pdfbox/commons-logging-1.2/LICENSE.txt
73
77
  - vendor/pdfbox/commons-logging-1.2/NOTICE.txt
74
78
  - vendor/pdfbox/commons-logging-1.2/RELEASE-NOTES.txt
@@ -100,4 +104,8 @@ rubygems_version: 2.4.8
100
104
  signing_key:
101
105
  specification_version: 4
102
106
  summary: Extract plain text from PDF documents.
103
- test_files: []
107
+ test_files:
108
+ - spec/pdfbox_text_extraction_spec.rb
109
+ - spec/spec_helper.rb
110
+ - spec/test_file.odt
111
+ - spec/test_file.pdf