pdfbox_text_extraction 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/pdfbox_text_extraction.rb +47 -17
- data/lib/pdfbox_text_extraction/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 70c791a4fe2ab12583748f72ba913773ee783f13
|
4
|
+
data.tar.gz: 38d02df12fdd6cb6a9ced823a9177620d3ddd837
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4b55b665c756ce6f706bedb871bc9ddb143a4c8b4929efff62715f6032937ce9202c565e556fbae81aa6ddaef53afd3a5bae8a72dfe6f5078140cfceca3b770
|
7
|
+
data.tar.gz: c15b82ece24ba36a172d94451f3e39bfaf1d3bb3b0226ab42eb9e172d28618f2b415196b9ca0fa8a5d6241a2f87e6495dc51698b513f9aad6dbc75d7bf6aacd0
|
data/CHANGELOG.md
CHANGED
@@ -26,10 +26,20 @@ class PdfboxTextExtraction
|
|
26
26
|
#
|
27
27
|
# @param path_to_pdf [String]
|
28
28
|
# @param options [Hash, optional]
|
29
|
-
# @
|
30
|
-
# @
|
31
|
-
# @
|
32
|
-
# @
|
29
|
+
# @option options [Float] crop_x crop area top left corner x-coordinate
|
30
|
+
# @option options [Float] crop_y crop area top left corner y-coordinate
|
31
|
+
# @option options [Float] crop_width crop area width
|
32
|
+
# @option options [Float] crop_height crop area height
|
33
|
+
# @option options [Float] average_char_tolerance
|
34
|
+
# @option options [Float] drop_threshold
|
35
|
+
# @option options [Float] indent_threshold
|
36
|
+
# @option options [Float] spacing_tolerance
|
37
|
+
# @option options [Boolean] sort_by_position
|
38
|
+
# @option options [String] line_separator
|
39
|
+
# @option options [String] page_end
|
40
|
+
# @option options [String] page_start
|
41
|
+
# @option options [String] paragraph_end
|
42
|
+
# @option options [String] paragraph_start
|
33
43
|
# @return [String] the extracted text
|
34
44
|
def self.run(path_to_pdf, options={})
|
35
45
|
file = File.new(path_to_pdf)
|
@@ -47,7 +57,7 @@ class PdfboxTextExtraction
|
|
47
57
|
)
|
48
58
|
text_stripper = PDFTextStripperByArea.new
|
49
59
|
text_stripper.addRegion("bodyText", body_text_rect)
|
50
|
-
configure_text_extraction_params(text_stripper)
|
60
|
+
configure_text_extraction_params(text_stripper, options)
|
51
61
|
|
52
62
|
pd_doc.getPages.each do |page|
|
53
63
|
text_stripper.extractRegions(page)
|
@@ -57,7 +67,7 @@ class PdfboxTextExtraction
|
|
57
67
|
else
|
58
68
|
# No crop options given, extract all text
|
59
69
|
text_stripper = PDFTextStripper.new
|
60
|
-
configure_text_extraction_params(text_stripper)
|
70
|
+
configure_text_extraction_params(text_stripper, options)
|
61
71
|
all_text << text_stripper.getText(pd_doc)
|
62
72
|
end
|
63
73
|
|
@@ -68,7 +78,7 @@ class PdfboxTextExtraction
|
|
68
78
|
|
69
79
|
# Sets params on text_stripper.
|
70
80
|
# @param text_stripper [PDFTextStripper]
|
71
|
-
def self.configure_text_extraction_params(text_stripper)
|
81
|
+
def self.configure_text_extraction_params(text_stripper, options)
|
72
82
|
|
73
83
|
# *****************************************************
|
74
84
|
# Extraction thresholds and tolerances
|
@@ -76,25 +86,33 @@ class PdfboxTextExtraction
|
|
76
86
|
# Set the character width-based tolerance value that is used to estimate
|
77
87
|
# where spaces in text should be added.
|
78
88
|
# Default: 0.30000001192092896
|
79
|
-
|
89
|
+
if(o = options[:average_char_tolerance])
|
90
|
+
text_stripper.setAverageCharTolerance(o)
|
91
|
+
end
|
80
92
|
|
81
93
|
# Set the minimum whitespace, as a multiple of the max height of the current
|
82
94
|
# characters beyond which the current line start is considered to be a
|
83
95
|
# paragraph start.
|
84
96
|
# Default: 2.5
|
85
|
-
|
97
|
+
if(o = options[:drop_threshold])
|
98
|
+
text_stripper.setDropThreshold(o)
|
99
|
+
end
|
86
100
|
|
87
101
|
# Set the multiple of whitespace character widths for the current text
|
88
102
|
# which the current line start can be indented from the previous line
|
89
103
|
# start beyond which the current line start is considered to be a
|
90
104
|
# paragraph start.
|
91
105
|
# Default: 2.0
|
92
|
-
|
106
|
+
if(o = options[:indent_threshold])
|
107
|
+
text_stripper.setIndentThreshold(o)
|
108
|
+
end
|
93
109
|
|
94
110
|
# Set the space width-based tolerance value that is used to estimate where
|
95
111
|
# spaces in text should be added.
|
96
112
|
# Default: 0.5
|
97
|
-
|
113
|
+
if(o = options[:spacing_tolerance])
|
114
|
+
text_stripper.setSpacingTolerance(o)
|
115
|
+
end
|
98
116
|
|
99
117
|
# *****************************************************
|
100
118
|
# Sort order
|
@@ -102,30 +120,42 @@ class PdfboxTextExtraction
|
|
102
120
|
# The order of the text tokens in a PDF file may not be the same as
|
103
121
|
# they appear visually on the screen.
|
104
122
|
# Default: false
|
105
|
-
|
123
|
+
if !(o = options[:sort_by_position]).nil? # Allow override of false
|
124
|
+
text_stripper.setSortByPosition(o)
|
125
|
+
end
|
106
126
|
|
107
127
|
# *****************************************************
|
108
128
|
# Separator tokens
|
109
129
|
|
110
130
|
# Set the desired line separator for output text.
|
111
131
|
# Default: "\n"
|
112
|
-
|
132
|
+
if(o = options[:line_separator])
|
133
|
+
text_stripper.setLineSeparator(o)
|
134
|
+
end
|
113
135
|
|
114
136
|
# Set the string which will be used at the end of a page.
|
115
137
|
# Default: ""
|
116
|
-
|
138
|
+
if(o = options[:page_end])
|
139
|
+
text_stripper.setPageEnd(o)
|
140
|
+
end
|
117
141
|
|
118
142
|
# Set the string which will be used at the beginning of a page.
|
119
143
|
# Default: ""
|
120
|
-
|
144
|
+
if(o = options[:page_start])
|
145
|
+
text_stripper.setPageStart(o)
|
146
|
+
end
|
121
147
|
|
122
148
|
# Set the string which will be used at the end of a paragraph.
|
123
149
|
# Default: ""
|
124
|
-
|
150
|
+
if(o = options[:paragraph_end])
|
151
|
+
text_stripper.setParagraphEnd(o)
|
152
|
+
end
|
125
153
|
|
126
154
|
# Set the string which will be used at the beginning of a paragraph.
|
127
155
|
# Default: ""
|
128
|
-
|
156
|
+
if(o = options[:paragraph_start])
|
157
|
+
text_stripper.setParagraphStart(o)
|
158
|
+
end
|
129
159
|
|
130
160
|
end
|
131
161
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfbox_text_extraction
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jo Hund
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|