pdfbox_text_extraction 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/lib/pdfbox_text_extraction.rb +4 -10
- data/lib/pdfbox_text_extraction/version.rb +1 -1
- data/spec/pdfbox_text_extraction_spec.rb +39 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/test_file.odt +0 -0
- data/spec/test_file.pdf +0 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
|
4
|
+
data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
|
7
|
+
data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
|
data/CHANGELOG.md
CHANGED
data/lib/pdfbox_text_extraction.rb
CHANGED
@@ -31,18 +31,12 @@ class PdfboxTextExtraction
|
|
31
31
|
# @param option [Float] crop_width crop area width
|
32
32
|
# @param option [Float] crop_height crop area height
|
33
33
|
# @return [String] the extracted text
|
34
|
-
def self.run(path_to_pdf, options)
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
# Extracts text
|
39
|
-
# @see #run
|
40
|
-
def self.extract_text(pdf_filepath, options)
|
41
|
-
file = File.new(pdf_filepath)
|
34
|
+
def self.run(path_to_pdf, options={})
|
35
|
+
file = File.new(path_to_pdf)
|
42
36
|
pd_doc = PDDocument.load(file)
|
43
37
|
text_stripper = nil
|
44
38
|
all_text = ''
|
45
|
-
if
|
39
|
+
if [:crop_x, :crop_y, :crop_width, :crop_height].any? { |e| options[e] }
|
46
40
|
# crop options given, extract from crop area only
|
47
41
|
res = 72
|
48
42
|
body_text_rect = Rectangle2D::Float.new(
|
@@ -74,7 +68,7 @@ class PdfboxTextExtraction
|
|
74
68
|
|
75
69
|
# Sets params on text_stripper.
|
76
70
|
# @param text_stripper [PDFTextStripper]
|
77
|
-
def configure_text_extraction_params(text_stripper)
|
71
|
+
def self.configure_text_extraction_params(text_stripper)
|
78
72
|
|
79
73
|
# *****************************************************
|
80
74
|
# Extraction thresholds and tolerances
|
data/spec/pdfbox_text_extraction_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require_relative './spec_helper'
|
4
|
+
|
5
|
+
describe PdfboxTextExtraction do
|
6
|
+
|
7
|
+
describe ".run" do
|
8
|
+
|
9
|
+
let(:pdf_file_path) { File.expand_path("../test_file.pdf", __FILE__) }
|
10
|
+
|
11
|
+
it "extracts full page text" do
|
12
|
+
extracted_text = PdfboxTextExtraction.run(pdf_file_path)
|
13
|
+
extracted_text.must_equal(
|
14
|
+
[
|
15
|
+
'This is a test pdf for the pdfbox_text_extraction Ruby gem.',
|
16
|
+
'Text at the top of the page.',
|
17
|
+
'Text in the middle of the page.',
|
18
|
+
'Text at the bottom of the page.',
|
19
|
+
'',
|
20
|
+
].join("\n")
|
21
|
+
)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "extracts crop area text" do
|
25
|
+
extracted_text = PdfboxTextExtraction.run(
|
26
|
+
pdf_file_path,
|
27
|
+
{
|
28
|
+
crop_x: 0,
|
29
|
+
crop_y: 3.0,
|
30
|
+
crop_width: 8.5,
|
31
|
+
crop_height: 6.0,
|
32
|
+
}
|
33
|
+
)
|
34
|
+
extracted_text.must_equal("Text in the middle of the page.\n\n")
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/test_file.odt
ADDED
Binary file
|
data/spec/test_file.pdf
ADDED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfbox_text_extraction
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jo Hund
|
@@ -69,6 +69,10 @@ files:
|
|
69
69
|
- lib/pdfbox_text_extraction.rb
|
70
70
|
- lib/pdfbox_text_extraction/version.rb
|
71
71
|
- pdfbox_text_extraction.gemspec
|
72
|
+
- spec/pdfbox_text_extraction_spec.rb
|
73
|
+
- spec/spec_helper.rb
|
74
|
+
- spec/test_file.odt
|
75
|
+
- spec/test_file.pdf
|
72
76
|
- vendor/pdfbox/commons-logging-1.2/LICENSE.txt
|
73
77
|
- vendor/pdfbox/commons-logging-1.2/NOTICE.txt
|
74
78
|
- vendor/pdfbox/commons-logging-1.2/RELEASE-NOTES.txt
|
@@ -100,4 +104,8 @@ rubygems_version: 2.4.8
|
|
100
104
|
signing_key:
|
101
105
|
specification_version: 4
|
102
106
|
summary: Extract plain text from PDF documents.
|
103
|
-
test_files: []
|
107
|
+
test_files:
|
108
|
+
- spec/pdfbox_text_extraction_spec.rb
|
109
|
+
- spec/spec_helper.rb
|
110
|
+
- spec/test_file.odt
|
111
|
+
- spec/test_file.pdf
|