pdfbox_text_extraction 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/lib/pdfbox_text_extraction.rb +4 -10
- data/lib/pdfbox_text_extraction/version.rb +1 -1
- data/spec/pdfbox_text_extraction_spec.rb +39 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/test_file.odt +0 -0
- data/spec/test_file.pdf +0 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
|
4
|
+
data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
|
7
|
+
data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
|
data/CHANGELOG.md
CHANGED
data/lib/pdfbox_text_extraction.rb
CHANGED
@@ -31,18 +31,12 @@ class PdfboxTextExtraction
|
|
31
31
|
# @param option [Float] crop_width crop area width
|
32
32
|
# @param option [Float] crop_height crop area height
|
33
33
|
# @return [String] the extracted text
|
34
|
-
def self.run(path_to_pdf, options)
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
# Extracts text
|
39
|
-
# @see #run
|
40
|
-
def self.extract_text(pdf_filepath, options)
|
41
|
-
file = File.new(pdf_filepath)
|
34
|
+
def self.run(path_to_pdf, options={})
|
35
|
+
file = File.new(path_to_pdf)
|
42
36
|
pd_doc = PDDocument.load(file)
|
43
37
|
text_stripper = nil
|
44
38
|
all_text = ''
|
45
|
-
if
|
39
|
+
if [:crop_x, :crop_y, :crop_width, :crop_height].any? { |e| options[e] }
|
46
40
|
# crop options given, extract from crop area only
|
47
41
|
res = 72
|
48
42
|
body_text_rect = Rectangle2D::Float.new(
|
@@ -74,7 +68,7 @@ class PdfboxTextExtraction
|
|
74
68
|
|
75
69
|
# Sets params on text_stripper.
|
76
70
|
# @param text_stripper [PDFTextStripper]
|
77
|
-
def configure_text_extraction_params(text_stripper)
|
71
|
+
def self.configure_text_extraction_params(text_stripper)
|
78
72
|
|
79
73
|
# *****************************************************
|
80
74
|
# Extraction thresholds and tolerances
|
data/spec/pdfbox_text_extraction_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require_relative './spec_helper'
|
4
|
+
|
5
|
+
describe PdfboxTextExtraction do
|
6
|
+
|
7
|
+
describe ".run" do
|
8
|
+
|
9
|
+
let(:pdf_file_path) { File.expand_path("../test_file.pdf", __FILE__) }
|
10
|
+
|
11
|
+
it "extracts full page text" do
|
12
|
+
extracted_text = PdfboxTextExtraction.run(pdf_file_path)
|
13
|
+
extracted_text.must_equal(
|
14
|
+
[
|
15
|
+
'This is a test pdf for the pdfbox_text_extraction Ruby gem.',
|
16
|
+
'Text at the top of the page.',
|
17
|
+
'Text in the middle of the page.',
|
18
|
+
'Text at the bottom of the page.',
|
19
|
+
'',
|
20
|
+
].join("\n")
|
21
|
+
)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "extracts crop area text" do
|
25
|
+
extracted_text = PdfboxTextExtraction.run(
|
26
|
+
pdf_file_path,
|
27
|
+
{
|
28
|
+
crop_x: 0,
|
29
|
+
crop_y: 3.0,
|
30
|
+
crop_width: 8.5,
|
31
|
+
crop_height: 6.0,
|
32
|
+
}
|
33
|
+
)
|
34
|
+
extracted_text.must_equal("Text in the middle of the page.\n\n")
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/test_file.odt
ADDED
Binary file
|
data/spec/test_file.pdf
ADDED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfbox_text_extraction
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jo Hund
|
@@ -69,6 +69,10 @@ files:
|
|
69
69
|
- lib/pdfbox_text_extraction.rb
|
70
70
|
- lib/pdfbox_text_extraction/version.rb
|
71
71
|
- pdfbox_text_extraction.gemspec
|
72
|
+
- spec/pdfbox_text_extraction_spec.rb
|
73
|
+
- spec/spec_helper.rb
|
74
|
+
- spec/test_file.odt
|
75
|
+
- spec/test_file.pdf
|
72
76
|
- vendor/pdfbox/commons-logging-1.2/LICENSE.txt
|
73
77
|
- vendor/pdfbox/commons-logging-1.2/NOTICE.txt
|
74
78
|
- vendor/pdfbox/commons-logging-1.2/RELEASE-NOTES.txt
|
@@ -100,4 +104,8 @@ rubygems_version: 2.4.8
|
|
100
104
|
signing_key:
|
101
105
|
specification_version: 4
|
102
106
|
summary: Extract plain text from PDF documents.
|
103
|
-
test_files:
|
107
|
+
test_files:
|
108
|
+
- spec/pdfbox_text_extraction_spec.rb
|
109
|
+
- spec/spec_helper.rb
|
110
|
+
- spec/test_file.odt
|
111
|
+
- spec/test_file.pdf
|