pdfbox_text_extraction 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 037d71d25f199a1239bc1af7baff641840ab9f4a
4
- data.tar.gz: 51ba95d37d5aac68439a348f0c139ab648759760
3
+ metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
4
+ data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
5
5
  SHA512:
6
- metadata.gz: 10fb9b69f45e7568d2508de9f21987d51d2fd0476a1acc2e4799f588fc16d396a374730db43e2e01734d9b2d4edc44efca502043ec2e71dd88bfc4497b084555
7
- data.tar.gz: fde63fedb74600ec80978af3c81f63f135304ff284e75772f953bd6a1e6f2030b0d9668a421647b5497823558100cb6e7e04badacae284f78e61ef1aa972a3ca
6
+ metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
7
+ data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ### 1.0.2
2
+
3
+ * Added specs
4
+ * Refactorings and bug fixes
5
+
1
6
  ### 1.0.1
2
7
 
3
8
  * Fixed file name
@@ -31,18 +31,12 @@ class PdfboxTextExtraction
31
31
  # @param option [Float] crop_width crop area width
32
32
  # @param option [Float] crop_height crop area height
33
33
  # @return [String] the extracted text
34
- def self.run(path_to_pdf, options)
35
- extract_text(path_to_pdf, options)
36
- end
37
-
38
- # Extracts text
39
- # @see #run
40
- def self.extract_text(pdf_filepath, options)
41
- file = File.new(pdf_filepath)
34
+ def self.run(path_to_pdf, options={})
35
+ file = File.new(path_to_pdf)
42
36
  pd_doc = PDDocument.load(file)
43
37
  text_stripper = nil
44
38
  all_text = ''
45
- if %i[crop_x crop_y crop_width crop_height].any? { |e| options[e] }
39
+ if [:crop_x, :crop_y, :crop_width, :crop_height].any? { |e| options[e] }
46
40
  # crop options given, extract from crop area only
47
41
  res = 72
48
42
  body_text_rect = Rectangle2D::Float.new(
@@ -74,7 +68,7 @@ class PdfboxTextExtraction
74
68
 
75
69
  # Sets params on text_stripper.
76
70
  # @param text_stripper [PDFTextStripper]
77
- def configure_text_extraction_params(text_stripper)
71
+ def self.configure_text_extraction_params(text_stripper)
78
72
 
79
73
  # *****************************************************
80
74
  # Extraction thresholds and tolerances
@@ -1,3 +1,3 @@
1
1
  class PdfboxTextExtraction
2
- VERSION = "1.0.1"
2
+ VERSION = "1.0.2"
3
3
  end
@@ -0,0 +1,39 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require_relative './spec_helper'
4
+
5
+ describe PdfboxTextExtraction do
6
+
7
+ describe ".run" do
8
+
9
+ let(:pdf_file_path) { File.expand_path("../test_file.pdf", __FILE__) }
10
+
11
+ it "extracts full page text" do
12
+ extracted_text = PdfboxTextExtraction.run(pdf_file_path)
13
+ extracted_text.must_equal(
14
+ [
15
+ 'This is a test pdf for the pdfbox_text_extraction Ruby gem.',
16
+ 'Text at the top of the page.',
17
+ 'Text in the middle of the page.',
18
+ 'Text at the bottom of the page.',
19
+ '',
20
+ ].join("\n")
21
+ )
22
+ end
23
+
24
+ it "extracts crop area text" do
25
+ extracted_text = PdfboxTextExtraction.run(
26
+ pdf_file_path,
27
+ {
28
+ crop_x: 0,
29
+ crop_y: 3.0,
30
+ crop_width: 8.5,
31
+ crop_height: 6.0,
32
+ }
33
+ )
34
+ extracted_text.must_equal("Text in the middle of the page.\n\n")
35
+ end
36
+
37
+ end
38
+
39
+ end
@@ -0,0 +1,5 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'bundler/setup'
4
+ require 'minitest/autorun'
5
+ require 'pdfbox_text_extraction'
Binary file
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfbox_text_extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jo Hund
@@ -69,6 +69,10 @@ files:
69
69
  - lib/pdfbox_text_extraction.rb
70
70
  - lib/pdfbox_text_extraction/version.rb
71
71
  - pdfbox_text_extraction.gemspec
72
+ - spec/pdfbox_text_extraction_spec.rb
73
+ - spec/spec_helper.rb
74
+ - spec/test_file.odt
75
+ - spec/test_file.pdf
72
76
  - vendor/pdfbox/commons-logging-1.2/LICENSE.txt
73
77
  - vendor/pdfbox/commons-logging-1.2/NOTICE.txt
74
78
  - vendor/pdfbox/commons-logging-1.2/RELEASE-NOTES.txt
@@ -100,4 +104,8 @@ rubygems_version: 2.4.8
100
104
  signing_key:
101
105
  specification_version: 4
102
106
  summary: Extract plain text from PDF documents.
103
- test_files: []
107
+ test_files:
108
+ - spec/pdfbox_text_extraction_spec.rb
109
+ - spec/spec_helper.rb
110
+ - spec/test_file.odt
111
+ - spec/test_file.pdf