pdfbox_text_extraction 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 68496b6265347fcbd44fca03f10d0f5f45565b83
4
- data.tar.gz: be3723f8439ef4c6a461cf148dda26c7c32e5a4d
3
+ metadata.gz: 70c791a4fe2ab12583748f72ba913773ee783f13
4
+ data.tar.gz: 38d02df12fdd6cb6a9ced823a9177620d3ddd837
5
5
  SHA512:
6
- metadata.gz: 31acd912221c54f20fbab2a7ec657b18fa2f60264b0227d130433294f3b3365f20a80b5c9cc38758d20a19667dbdc8570d2478980f9e107c153a6239f2e0277d
7
- data.tar.gz: badd9068e8d424c2b0b055968c734cc70828b0114b94d65b2e5b13d72748c50ff7fdf3e377843139c8685b7ce3a3b4ff3cb03573bb5243e549ff8304b48c7bad
6
+ metadata.gz: a4b55b665c756ce6f706bedb871bc9ddb143a4c8b4929efff62715f6032937ce9202c565e556fbae81aa6ddaef53afd3a5bae8a72dfe6f5078140cfceca3b770
7
+ data.tar.gz: c15b82ece24ba36a172d94451f3e39bfaf1d3bb3b0226ab42eb9e172d28618f2b415196b9ca0fa8a5d6241a2f87e6495dc51698b513f9aad6dbc75d7bf6aacd0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 1.1.0
2
+
3
+ * Allow overriding of extraction params
4
+
1
5
  ### 1.0.2
2
6
 
3
7
  * Added specs
@@ -26,10 +26,20 @@ class PdfboxTextExtraction
26
26
  #
27
27
  # @param path_to_pdf [String]
28
28
  # @param options [Hash, optional]
29
- # @param option [Float] crop_x crop area top left corner x-coordinate
30
- # @param option [Float] crop_y crop area top left corner y-coordinate
31
- # @param option [Float] crop_width crop area width
32
- # @param option [Float] crop_height crop area height
29
+ # @option options [Float] crop_x crop area top left corner x-coordinate
30
+ # @option options [Float] crop_y crop area top left corner y-coordinate
31
+ # @option options [Float] crop_width crop area width
32
+ # @option options [Float] crop_height crop area height
33
+ # @option options [Float] average_char_tolerance
34
+ # @option options [Float] drop_threshold
35
+ # @option options [Float] indent_threshold
36
+ # @option options [Float] spacing_tolerance
37
+ # @option options [Boolean] sort_by_position
38
+ # @option options [String] line_separator
39
+ # @option options [String] page_end
40
+ # @option options [String] page_start
41
+ # @option options [String] paragraph_end
42
+ # @option options [String] paragraph_start
33
43
  # @return [String] the extracted text
34
44
  def self.run(path_to_pdf, options={})
35
45
  file = File.new(path_to_pdf)
@@ -47,7 +57,7 @@ class PdfboxTextExtraction
47
57
  )
48
58
  text_stripper = PDFTextStripperByArea.new
49
59
  text_stripper.addRegion("bodyText", body_text_rect)
50
- configure_text_extraction_params(text_stripper)
60
+ configure_text_extraction_params(text_stripper, options)
51
61
 
52
62
  pd_doc.getPages.each do |page|
53
63
  text_stripper.extractRegions(page)
@@ -57,7 +67,7 @@ class PdfboxTextExtraction
57
67
  else
58
68
  # No crop options given, extract all text
59
69
  text_stripper = PDFTextStripper.new
60
- configure_text_extraction_params(text_stripper)
70
+ configure_text_extraction_params(text_stripper, options)
61
71
  all_text << text_stripper.getText(pd_doc)
62
72
  end
63
73
 
@@ -68,7 +78,7 @@ class PdfboxTextExtraction
68
78
 
69
79
  # Sets params on text_stripper.
70
80
  # @param text_stripper [PDFTextStripper]
71
- def self.configure_text_extraction_params(text_stripper)
81
+ def self.configure_text_extraction_params(text_stripper, options)
72
82
 
73
83
  # *****************************************************
74
84
  # Extraction thresholds and tolerances
@@ -76,25 +86,33 @@ class PdfboxTextExtraction
76
86
  # Set the character width-based tolerance value that is used to estimate
77
87
  # where spaces in text should be added.
78
88
  # Default: 0.30000001192092896
79
- # text_stripper.setAverageCharTolerance(0.30000001192092896)
89
+ if(o = options[:average_char_tolerance])
90
+ text_stripper.setAverageCharTolerance(o)
91
+ end
80
92
 
81
93
  # Set the minimum whitespace, as a multiple of the max height of the current
82
94
  # characters beyond which the current line start is considered to be a
83
95
  # paragraph start.
84
96
  # Default: 2.5
85
- # text_stripper.setDropThreshold(2.5)
97
+ if(o = options[:drop_threshold])
98
+ text_stripper.setDropThreshold(o)
99
+ end
86
100
 
87
101
  # Set the multiple of whitespace character widths for the current text
88
102
  # which the current line start can be indented from the previous line
89
103
  # start beyond which the current line start is considered to be a
90
104
  # paragraph start.
91
105
  # Default: 2.0
92
- # text_stripper.setIndentThreshold(2.0)
106
+ if(o = options[:indent_threshold])
107
+ text_stripper.setIndentThreshold(o)
108
+ end
93
109
 
94
110
  # Set the space width-based tolerance value that is used to estimate where
95
111
  # spaces in text should be added.
96
112
  # Default: 0.5
97
- text_stripper.setSpacingTolerance(0.3)
113
+ if(o = options[:spacing_tolerance])
114
+ text_stripper.setSpacingTolerance(o)
115
+ end
98
116
 
99
117
  # *****************************************************
100
118
  # Sort order
@@ -102,30 +120,42 @@ class PdfboxTextExtraction
102
120
  # The order of the text tokens in a PDF file may not be in the same order as
103
121
  # they appear visually on the screen.
104
122
  # Default: false
105
- text_stripper.setSortByPosition(false)
123
+ if !(o = options[:sort_by_position]).nil? # Allow override of false
124
+ text_stripper.setSortByPosition(o)
125
+ end
106
126
 
107
127
  # *****************************************************
108
128
  # Separator tokens
109
129
 
110
130
  # Set the desired line separator for output text.
111
131
  # Default: "\n"
112
- # text_stripper.setLineSeparator("\n")
132
+ if(o = options[:line_separator])
133
+ text_stripper.setLineSeparator(o)
134
+ end
113
135
 
114
136
  # Set the string which will be used at the end of a page.
115
137
  # Default: ""
116
- # text_stripper.setPageEnd("<<page-end>>")
138
+ if(o = options[:page_end])
139
+ text_stripper.setPageEnd(o)
140
+ end
117
141
 
118
142
  # Set the string which will be used at the beginning of a page.
119
143
  # Default: ""
120
- # text_stripper.setPageStart("<<page-start>>")
144
+ if(o = options[:page_start])
145
+ text_stripper.setPageStart(o)
146
+ end
121
147
 
122
148
  # Set the string which will be used at the end of a paragraph.
123
149
  # Default: ""
124
- # text_stripper.setParagraphEnd("<<paragraph-end>>")
150
+ if(o = options[:paragraph_end])
151
+ text_stripper.setParagraphEnd(o)
152
+ end
125
153
 
126
154
  # Set the string which will be used at the beginning of a paragraph.
127
155
  # Default: ""
128
- # text_stripper.setParagraphStart("<<paragraph-start>>")
156
+ if(o = options[:paragraph_start])
157
+ text_stripper.setParagraphStart(o)
158
+ end
129
159
 
130
160
  end
131
161
 
@@ -1,3 +1,3 @@
1
1
  class PdfboxTextExtraction
2
- VERSION = "1.0.2"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfbox_text_extraction
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jo Hund
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-19 00:00:00.000000000 Z
11
+ date: 2016-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement