pdfbox_text_extraction 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
4
- data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
3
+ metadata.gz: 70c791a4fe2ab12583748f72ba913773ee783f13
4
+ data.tar.gz: 38d02df12fdd6cb6a9ced823a9177620d3ddd837
5
5
  SHA512:
6
- metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
7
- data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
6
+ metadata.gz: a4b55b665c756ce6f706bedb871bc9ddb143a4c8b4929efff62715f6032937ce9202c565e556fbae81aa6ddaef53afd3a5bae8a72dfe6f5078140cfceca3b770
7
+ data.tar.gz: c15b82ece24ba36a172d94451f3e39bfaf1d3bb3b0226ab42eb9e172d28618f2b415196b9ca0fa8a5d6241a2f87e6495dc51698b513f9aad6dbc75d7bf6aacd0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 1.1.0
2
+
3
+ * Allow overriding of extraction params
4
+
1
5
  ### 1.0.2
2
6
 
3
7
  * Added specs
@@ -26,10 +26,20 @@ class PdfboxTextExtraction
26
26
  #
27
27
  # @param path_to_pdf [String]
28
28
  # @param options [Hash, optional]
29
- # @param option [Float] crop_x crop area top left corner x-coordinate
30
- # @param option [Float] crop_y crop area top left corner y-coordinate
31
- # @param option [Float] crop_width crop area width
32
- # @param option [Float] crop_height crop area height
29
+ # @option options [Float] crop_x crop area top left corner x-coordinate
30
+ # @option options [Float] crop_y crop area top left corner y-coordinate
31
+ # @option options [Float] crop_width crop area width
32
+ # @option options [Float] crop_height crop area height
33
+ # @option options [Float] average_char_tolerance
34
+ # @option options [Float] drop_threshold
35
+ # @option options [Float] indent_threshold
36
+ # @option options [Float] spacing_tolerance
37
+ # @option options [Boolean] sort_by_position
38
+ # @option options [String] line_separator
39
+ # @option options [String] page_end
40
+ # @option options [String] page_start
41
+ # @option options [String] paragraph_end
42
+ # @option options [String] paragraph_start
33
43
  # @return [String] the extracted text
34
44
  def self.run(path_to_pdf, options={})
35
45
  file = File.new(path_to_pdf)
@@ -47,7 +57,7 @@ class PdfboxTextExtraction
47
57
  )
48
58
  text_stripper = PDFTextStripperByArea.new
49
59
  text_stripper.addRegion("bodyText", body_text_rect)
50
- configure_text_extraction_params(text_stripper)
60
+ configure_text_extraction_params(text_stripper, options)
51
61
 
52
62
  pd_doc.getPages.each do |page|
53
63
  text_stripper.extractRegions(page)
@@ -57,7 +67,7 @@ class PdfboxTextExtraction
57
67
  else
58
68
  # No crop options given, extract all text
59
69
  text_stripper = PDFTextStripper.new
60
- configure_text_extraction_params(text_stripper)
70
+ configure_text_extraction_params(text_stripper, options)
61
71
  all_text << text_stripper.getText(pd_doc)
62
72
  end
63
73
 
@@ -68,7 +78,7 @@ class PdfboxTextExtraction
68
78
 
69
79
  # Sets params on text_stripper.
70
80
  # @param text_stripper [PDFTextStripper]
71
- def self.configure_text_extraction_params(text_stripper)
81
+ def self.configure_text_extraction_params(text_stripper, options)
72
82
 
73
83
  # *****************************************************
74
84
  # Extraction thresholds and tolerances
@@ -76,25 +86,33 @@ class PdfboxTextExtraction
76
86
  # Set the character width-based tolerance value that is used to estimate
77
87
  # where spaces in text should be added.
78
88
  # Default: 0.30000001192092896
79
- # text_stripper.setAverageCharTolerance(0.30000001192092896)
89
+ if(o = options[:average_char_tolerance])
90
+ text_stripper.setAverageCharTolerance(o)
91
+ end
80
92
 
81
93
  # Set the minimum whitespace, as a multiple of the max height of the current
82
94
  # characters beyond which the current line start is considered to be a
83
95
  # paragraph start.
84
96
  # Default: 2.5
85
- # text_stripper.setDropThreshold(2.5)
97
+ if(o = options[:drop_threshold])
98
+ text_stripper.setDropThreshold(o)
99
+ end
86
100
 
87
101
  # Set the multiple of whitespace character widths for the current text
88
102
  # which the current line start can be indented from the previous line
89
103
  # start beyond which the current line start is considered to be a
90
104
  # paragraph start.
91
105
  # Default: 2.0
92
- # text_stripper.setIndentThreshold(2.0)
106
+ if(o = options[:indent_threshold])
107
+ text_stripper.setIndentThreshold(o)
108
+ end
93
109
 
94
110
  # Set the space width-based tolerance value that is used to estimate where
95
111
  # spaces in text should be added.
96
112
  # Default: 0.5
97
- text_stripper.setSpacingTolerance(0.3)
113
+ if(o = options[:spacing_tolerance])
114
+ text_stripper.setSpacingTolerance(o)
115
+ end
98
116
 
99
117
  # *****************************************************
100
118
  # Sort order
@@ -102,30 +120,42 @@ class PdfboxTextExtraction
102
120
  # The text tokens in a PDF file may not be stored in the same order as
103
121
  # they appear visually on the screen.
104
122
  # Default: false
105
- text_stripper.setSortByPosition(false)
123
+ if !(o = options[:sort_by_position]).nil? # Allow override of false
124
+ text_stripper.setSortByPosition(o)
125
+ end
106
126
 
107
127
  # *****************************************************
108
128
  # Separator tokens
109
129
 
110
130
  # Set the desired line separator for output text.
111
131
  # Default: "\n"
112
- # text_stripper.setLineSeparator("\n")
132
+ if(o = options[:line_separator])
133
+ text_stripper.setLineSeparator(o)
134
+ end
113
135
 
114
136
  # Set the string which will be used at the end of a page.
115
137
  # Default: ""
116
- # text_stripper.setPageEnd("<<page-end>>")
138
+ if(o = options[:page_end])
139
+ text_stripper.setPageEnd(o)
140
+ end
117
141
 
118
142
  # Set the string which will be used at the beginning of a page.
119
143
  # Default: ""
120
- # text_stripper.setPageStart("<<page-start>>")
144
+ if(o = options[:page_start])
145
+ text_stripper.setPageStart(o)
146
+ end
121
147
 
122
148
  # Set the string which will be used at the end of a paragraph.
123
149
  # Default: ""
124
- # text_stripper.setParagraphEnd("<<paragraph-end>>")
150
+ if(o = options[:paragraph_end])
151
+ text_stripper.setParagraphEnd(o)
152
+ end
125
153
 
126
154
  # Set the string which will be used at the beginning of a paragraph.
127
155
  # Default: ""
128
- # text_stripper.setParagraphStart("<<paragraph-start>>")
156
+ if(o = options[:paragraph_start])
157
+ text_stripper.setParagraphStart(o)
158
+ end
129
159
 
130
160
  end
131
161
 
@@ -1,3 +1,3 @@
1
1
  class PdfboxTextExtraction
2
- VERSION = "1.0.2"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfbox_text_extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jo Hund
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-19 00:00:00.000000000 Z
11
+ date: 2016-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement