pdf-extract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,320 @@
1
+ require 'matrix'
2
+
3
+ require_relative '../font_metrics'
4
+
5
+ module PdfExtract
6
+ module Characters
7
+
8
+ # TODO Implement writing mode and :FontMatrix.
9
+
10
# Descent of the currently selected font, scaled from 1000ths of a unit.
#
# c     - the character being measured (not consulted here).
# state - graphics state stack; only state.last[:font_metrics] is read.
#
# Returns 0 when there are no font metrics or the metrics report no descent,
# otherwise the metrics' descent divided by 1000.0.
def self.glyph_descent(c, state)
  metrics = state.last[:font_metrics]
  return 0 if metrics.nil? || metrics.descent.nil?

  metrics.descent / 1000.0
end
17
+
18
# Ascent of the currently selected font, scaled from 1000ths of a unit.
#
# c     - the character being measured (not consulted here).
# state - graphics state stack; only state.last[:font_metrics] is read.
#
# Returns 0 when there are no font metrics or the metrics report no ascent,
# otherwise the metrics' ascent divided by 1000.0.
def self.glyph_ascent(c, state)
  metrics = state.last[:font_metrics]
  return 0 if metrics.nil? || metrics.ascent.nil?

  metrics.ascent / 1000.0
end
25
+
26
# Width of glyph +c+ under the current graphics state.
#
# :Widths may be used to determine glyph width. This is the same as the
# horizontal displacement, i.e. the first component returned by
# glyph_displacement.
def self.glyph_width(c, state)
  horizontal, _vertical = glyph_displacement(c, state)
  horizontal
end
31
+
32
# Maximum glyph height for the current font.
#
# :Ascent and :Descent from the :FontDescriptor can be used to determine
# maximum glyph height: it is the ascent minus the descent.
def self.glyph_height(c, state)
  top = glyph_ascent(c, state)
  bottom = glyph_descent(c, state)
  top - bottom
end
37
+
38
# Displacement applied to the text position after showing glyph +c+.
#
# The result is a two-element array [horizontal, vertical]. The horizontal
# component is the glyph width reported by the font metrics (given in
# 1000ths of text units, hence the division); the vertical component is
# always 0.
#
# c     - the character whose width is looked up.
# state - graphics state stack; only state.last[:font_metrics] is read.
def self.glyph_displacement(c, state)
  metrics = state.last[:font_metrics]

  # XXX Why are some font resources not reported via resource_font?
  # Bug in pdf-reader? Possibly because of :Font entry in graphics
  # state set.
  return [0, 0] if metrics.nil?

  [metrics.glyph_width(c) / 1000.0, 0]
end
51
+
52
# Builds one positioned character object per character of +text+ and
# advances the text matrix (render_state[:tm]) as each glyph is shown.
#
# text         - String operand of a show-text operator.
# tj           - Numeric position adjustment in 1000ths of a text unit,
#                applied once before the first character (0 for none).
# state        - graphics state stack; state.last supplies :h_scale,
#                :font_size, :rise, :char_spacing, :word_spacing, :ctm,
#                :font and (via the glyph_* helpers) :font_metrics.
# render_state - Hash holding :tm; it is updated in place.
# page         - indexable with :MediaBox, read as [x0, y0, x1, y1].
# page_number  - stored on every returned object as :page.
#
# Returns an Array of Hashes with keys :x, :y, :width, :height,
# :line_height, :content, :page, :font, :page_width and :page_height.
#
# TODO Ignore chars outside the page :MediaBox.
# TODO Mul UserUnit if specified by page.
# TODO Include writing mode, so that runs can be joined either
#      vertically or horizontally in the join stage.
def self.make_text_runs(text, tj, state, render_state, page, page_number)
  gs = state.last
  font = gs[:font]
  font_size = gs[:font_size]
  h_scale = gs[:h_scale] / 100.0

  # TODO Should translate along x or y depending on writing mode; only
  # horizontal writing is handled for now.
  advance_tm = lambda do |tx|
    render_state[:tm] = Matrix[[1, 0, 0], [0, 1, 0], [tx, 0, 1]] * render_state[:tm]
  end

  # The tj adjustment applies only before the first char of the run.
  advance_tm.call(((0 - (tj / 1000.0)) * font_size) * h_scale)

  text.each_char.map do |c|
    # Text rendering matrix: font size / scaling / rise, then tm, then ctm.
    trm = Matrix[[font_size * h_scale, 0, 0],
                 [0, font_size, 0],
                 [0, gs[:rise], 1]]
    trm = trm * render_state[:tm] * gs[:ctm]

    descent = glyph_descent(c, state)
    bottom_left = Matrix.rows([[0, descent, 1]]) * trm
    top_right = Matrix.rows([[glyph_width(c, state),
                              descent + glyph_height(c, state),
                              1]]) * trm

    px = bottom_left[0, 0]
    py = bottom_left[0, 1]
    box_height = top_right[0, 1] - py
    media_box = page[:MediaBox]

    obj = {
      :x => px,
      :y => py,
      :width => top_right[0, 0] - px,
      :height => box_height,
      :line_height => box_height,
      # The glyph helpers already tolerate a missing font resource, so do
      # the same here instead of raising on nil: keep the raw character.
      :content => font ? font.to_utf8(c) : c,
      :page => page_number,
      :font => font && font.basefont,
      :page_width => media_box[2] - media_box[0],
      :page_height => media_box[3] - media_box[1]
    }

    # Character spacing applies to every glyph; word spacing is applied
    # in addition to it for the space character.
    disp_x, _disp_y = glyph_displacement(c, state)
    spacing = gs[:char_spacing]
    spacing += gs[:word_spacing] if c == ' '
    advance_tm.call((disp_x * font_size + spacing) * h_scale)

    obj
  end
end
116
+
117
# Registers the :characters spatial on +pdf+.
#
# The block handed to pdf.spatials installs one handler per content-stream
# operator. Handlers maintain:
#
#   state        - graphics state stack (one Hash per q/Q level, seeded on
#                  :begin_page) holding text state and the :ctm.
#   render_state - the text matrix (:tm) and text line matrix (:tlm),
#                  reset on every :begin_text_object.
#   fonts / font_metrics - font resources keyed by resource label.
#
# State-changing handlers return nil; the show-text handlers return the
# character objects produced by make_text_runs.
def self.include_in(pdf)
  pdf.spatials :characters do |parser|
    state = []
    page = nil
    fonts = {}
    font_metrics = {}
    page_n = 0
    render_state = {
      :tm => Matrix.identity(3),
      :tlm => Matrix.identity(3)
    }

    # Moves to a new line position: translate the text line matrix by
    # (tx, ty) and make the result both the text matrix and line matrix.
    start_line_at = lambda do |tx, ty|
      render_state[:tm] = Matrix[[1, 0, 0], [0, 1, 0], [tx, ty, 1]] * render_state[:tlm]
      render_state[:tlm] = render_state[:tm]
    end

    # Moves to the start of the next line using the current leading.
    next_line = lambda { start_line_at.call(0, -state.last[:leading]) }

    parser.for :resource_font do |data|
      fonts[data[0]] = data[1]
      font_metrics[data[0]] = FontMetrics.new data[1]
      nil
    end

    parser.for :begin_page do |data|
      page = data[0]
      page_n = page_n.next
      state << {
        :h_scale => 100,
        :char_spacing => 0,
        :word_spacing => 0,
        :leading => 0,
        :rise => 0,
        :font => nil,
        :font_metrics => nil,
        :font_size => 0,
        :ctm => Matrix.identity(3)
      }
      nil
    end

    parser.for :end_page do |data|
      state.pop
      nil
    end

    parser.for :begin_text_object do |data|
      render_state = {
        :tm => Matrix.identity(3),
        :tlm => Matrix.identity(3)
      }
      nil
    end

    # Graphics state operators.

    parser.for :set_graphics_state_parameters do |data|
      # TODO Handle gs graphics state dictionary set operation for
      # :Font dictionary entries. Probably why font is sometimes nil.
      nil
    end

    parser.for :save_graphics_state do |data|
      state.push state.last.dup
      nil
    end

    parser.for :restore_graphics_state do |data|
      state.pop
      nil
    end

    parser.for :concatenate_matrix do |data|
      a, b, c, d, e, f = data
      state.last[:ctm] = Matrix[[a, b, 0], [c, d, 0], [e, f, 1]] * state.last[:ctm]
      nil
    end

    # State change operators: each stores its first operand under the
    # given key of the current graphics state.
    {
      :set_text_leading => :leading,
      :set_text_rise => :rise,
      :set_character_spacing => :char_spacing,
      :set_word_spacing => :word_spacing,
      :set_horizontal_text_scaling => :h_scale
    }.each do |operator, key|
      parser.for operator do |data|
        state.last[key] = data.first
        nil
      end
    end

    # Position change operators.

    parser.for :move_text_position do |data|
      start_line_at.call(data[0], data[1])
      nil
    end

    parser.for :move_text_position_and_set_leading do |data|
      state.last[:leading] = -data[1]
      start_line_at.call(data[0], data[1])
      nil
    end

    # Font change operators.

    parser.for :set_text_font_and_size do |data|
      state.last[:font] = fonts[data[0]]
      state.last[:font_metrics] = font_metrics[data[0]]
      state.last[:font_size] = data[1]
      nil
    end

    # Text matrix change operators.

    parser.for :set_text_matrix_and_text_line_matrix do |data|
      # --     --
      # | a b 0 |
      # | c d 0 |
      # | e f 1 |
      # --     --
      a, b, c, d, e, f = data
      render_state[:tm] = Matrix[[a, b, 0], [c, d, 0], [e, f, 1]]
      render_state[:tlm] = Matrix[[a, b, 0], [c, d, 0], [e, f, 1]]
      nil
    end

    # New line operators.

    parser.for :move_to_start_of_next_line do |data|
      next_line.call
      nil
    end

    # Show text operators.

    parser.for :set_spacing_next_line_show_text_raw do |data|
      state.last[:word_spacing] = data[0]
      state.last[:char_spacing] = data[1]
      next_line.call

      make_text_runs data[2], 0, state, render_state, page, page_n
    end

    parser.for :move_to_next_line_and_show_text_raw do |data|
      next_line.call

      make_text_runs data.first, 0, state, render_state, page, page_n
    end

    parser.for :show_text_raw do |data|
      make_text_runs data.first, 0, state, render_state, page, page_n
    end

    parser.for :show_text_with_positioning_raw do |data|
      pending_tj = 0

      data.first.flat_map do |item|
        # Match on the value itself rather than on the class name:
        # integers are no longer reported as "Fixnum" by current Rubies,
        # which silently dropped integer adjustments.
        case item
        when Numeric
          pending_tj = item
          []
        when String
          runs = make_text_runs(item, pending_tj, state, render_state, page, page_n)
          # An adjustment applies only to the string that follows it.
          pending_tj = 0
          runs
        else
          []
        end
      end
    end
  end
end
318
+
319
+ end
320
+ end
@@ -0,0 +1,103 @@
1
+ require_relative '../spatial'
2
+
3
+ module PdfExtract
4
+ module Chunks
5
+
6
+ # TODO Look for obj[:writing_mode] == :vertical or :horizontal
7
+
8
# Default tolerances for joining characters into chunks:
#   :char_slop    - gap, as a multiple of the current character width,
#                   within which neighbours are joined with no separator.
#   :word_slop    - gap, as a multiple of the current character width,
#                   within which neighbours are joined with a single space.
#   :overlap_slop - minimum vertical overlap ratio for merging chunks whose
#                   :y positions differ.
Settings.default(:char_slop, 0.2)
Settings.default(:word_slop, 4.0)
Settings.default(:overlap_slop, 0.9)
11
+
12
# Registers the :chunks spatial on +pdf+ (paged, depends on :characters).
#
# Characters are bucketed into rows by their exact :y value, kept sorted by
# :x. After each page the rows are collapsed left to right into chunks, and
# chunks from different rows that overlap enough vertically are merged.
#
# The after-handler returns the Array of resulting chunks; it is empty when
# the page produced no characters.
def self.include_in(pdf)
  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
    rows = {}

    parser.before do
      rows = {}
    end

    parser.objects :characters do |chars|
      row = (rows[chars[:y]] ||= [])

      # Keep each row ordered by :x; append when nothing lies to the right.
      idx = row.index { |obj| chars[:x] <= obj[:x] }
      row.insert(idx || row.length, chars.dup)
    end

    parser.after do
      char_slop = pdf.settings[:char_slop]
      word_slop = pdf.settings[:word_slop]
      overlap_slop = pdf.settings[:overlap_slop]

      text_chunks = []

      rows.each_value do |row|
        char_width = row.first[:width]

        while row.length > 1
          left = row.first
          right = row[1]
          left_edge = left[:x] + left[:width]

          if (left_edge + (char_width * char_slop)) >= right[:x]
            # Join as adjacent chars.
            row[0] = Spatial.merge left, right
            row.delete_at 1
            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
          elsif (left_edge + (char_width * word_slop)) >= right[:x]
            # Join with a ' ' in the middle.
            row[0] = Spatial.merge left, right, :separator => ' '
            row.delete_at 1
            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
          else
            # Too far apart: the left chunk is complete.
            text_chunks << left
            row.delete_at 0
            char_width = row.first[:width]
          end
        end

        text_chunks << row.first
      end

      # Merge chunks that have slightly different :y positions but which
      # mostly overlap in the y dimension.

      text_chunks.sort_by! { |obj| obj[:x] }
      merged_text_chunks = []

      while text_chunks.count > 1
        left = text_chunks.first
        right = text_chunks[1]

        min_height = [left[:height], right[:height]].min
        overlap = min_height - (left[:y] - right[:y]).abs

        # A zero-height chunk can never overlap; checking first also avoids
        # dividing by zero. The float division keeps integer heights from
        # truncating the ratio.
        if !min_height.zero? && (overlap / min_height.to_f) >= overlap_slop
          # TODO follow char / word slop rules.
          text_chunks[0] = Spatial.merge left, right
          text_chunks.delete_at 1
        else
          merged_text_chunks << text_chunks.first
          text_chunks.delete_at 0
        end
      end

      # Nothing is left over when the page had no characters at all.
      merged_text_chunks << text_chunks.first unless text_chunks.empty?
      merged_text_chunks
    end
  end
end
100
+
101
+ end
102
+ end
103
+
@@ -0,0 +1,112 @@
1
+ require_relative '../spatial'
2
+
3
+ module PdfExtract
4
+ module Regions
5
+
6
# Default vertical tolerance for stacking chunks into a region: it is
# multiplied by the smaller of the lower chunk's line height and the upper
# chunk's height.
Settings.default(:line_slop, 1.0)
7
+
8
+ # TODO Handle :writing_mode once present in characters and text_chunks.
9
+
10
# Tests whether two objects touch or overlap along the x axis.
#
# l, r - objects responding to [:x] and [:width].
#
# Returns true when either end of one object's horizontal span
# (x .. x + width) lies within the other object's span, false otherwise.
def self.incident(l, r)
  left_span = l[:x]..(l[:x] + l[:width])
  right_span = r[:x]..(r[:x] + r[:width])

  [right_span.begin, right_span.end].any? { |edge| left_span.include?(edge) } ||
    [left_span.begin, left_span.end].any? { |edge| right_span.include?(edge) }
end
21
+
22
# Annotates every line of +region+ with its position relative to the region.
#
# region - Hash with :x, :y and (optionally) :lines; :lines is initialised
#          to an empty Array when absent.
#
# Each line gains :x_offset and :y_offset, the differences between the
# line's and the region's :x and :y. Returns the region's :lines array.
def self.append_line_offsets(region)
  lines = (region[:lines] ||= [])

  lines.each do |line|
    line.store(:x_offset, line[:x] - region[:x])
    line.store(:y_offset, line[:y] - region[:y])
  end
end
29
+
30
# Annotates every line of +region+ with the gap separating it from the
# space already used above it.
#
# region - Hash with :height and (optionally) :lines; :lines is initialised
#          to an empty Array when absent. Lines must already carry
#          :y_offset and :height.
#
# For each line, in order, :spacing is the distance from the region's top
# edge to the line's top edge minus the height consumed so far. Returns the
# region's :lines array.
def self.append_line_spacing(region)
  lines = (region[:lines] ||= [])

  lines.reduce(0) do |consumed, line|
    gap_from_top = region[:height] - (line[:y_offset] + line[:height])
    line[:spacing] = gap_from_top - consumed
    gap_from_top + line[:height]
  end

  lines
end
39
+
40
# Registers the :regions spatial on +pdf+ (paged, depends on :chunks).
#
# Chunks are collected in ascending :y order. After each page every chunk
# is turned into a single-line block, and blocks that are close enough
# vertically (within :line_slop) and incident horizontally are merged into
# regions. Each region's lines are then annotated with offsets and spacing
# and stripped of their spatial attributes.
#
# The after-handler returns the regions sorted by descending :y.
def self.include_in(pdf)
  pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
    chunks = []
    regions = []

    parser.before do
      chunks = []
      regions = []
    end

    parser.objects :chunks do |chunk|
      # Keep chunks ordered by :y; append when nothing lies above.
      idx = chunks.index { |obj| chunk[:y] <= obj[:y] }
      chunks.insert(idx || chunks.length, chunk.dup)
    end

    # TODO Rewrite to use Spatial::collapse so that text is in proper
    # order.

    parser.after do
      # Convert chunks to have line content.
      chunks.each do |chunk|
        chunk[:lines] = [Spatial.as_line(chunk)]
        chunk.delete :content
      end

      compare_index = 1
      while chunks.count > compare_index
        b = chunks.first
        t = chunks[compare_index]

        line_slop = [b[:line_height], t[:height]].min * pdf.settings[:line_slop]
        incident_y = (b[:y] + b[:height] + line_slop) >= t[:y]

        if incident_y && incident(t, b)
          # Merge and restart the scan against the grown region.
          chunks[0] = Spatial.merge t, b, :lines => true
          chunks.delete_at compare_index
          compare_index = 1
        elsif compare_index < chunks.count - 1
          # Could be more chunks within range.
          compare_index = compare_index.next
        else
          # Finished region.
          regions << chunks.first
          chunks.delete_at 0
          compare_index = 1
        end
      end

      regions << chunks.first unless chunks.first.nil?

      regions.each do |region|
        append_line_offsets region
        append_line_spacing region

        region[:lines].map! do |line|
          Spatial.drop_spatial line
        end
      end

      regions.sort_by { |obj| -obj[:y] }
    end
  end
end
110
+
111
+ end
112
+ end
@@ -0,0 +1,69 @@
1
+
2
+ module PdfExtract
3
# A set of non-overlapping ranges. Appending a range merges it with every
# stored range it touches, so the set always holds disjoint spans.
class MultiRange

  # The Array of disjoint ranges currently held.
  attr_accessor :ranges

  def initialize
    @ranges = []
  end

  # Adds +range+ to the set, merging it with every stored range that
  # overlaps or touches it. Ranges whose min or max is nil (for example
  # empty ranges) are ignored. Always returns nil.
  def append(range)
    return if range.max.nil? || range.min.nil?

    touching, separate = @ranges.partition { |r| overlapping?(r, range) }
    touching << range

    merged = touching.map(&:min).min..touching.map(&:max).max
    @ranges = separate << merged

    reset_cache
  end

  # With a single range, that range's max; otherwise the largest lower
  # bound among the ranges. Returns nil when the set is empty.
  def max_excluded
    return nil if @ranges.empty?

    @max_excluded ||= count == 1 ? @ranges.first.max : @ranges.map(&:min).max
  end

  # With a single range, that range's min; otherwise the smallest upper
  # bound among the ranges. Returns nil when the set is empty.
  def min_excluded
    return nil if @ranges.empty?

    @min_excluded ||= count == 1 ? @ranges.first.min : @ranges.map(&:max).min
  end

  # Largest upper bound of any range, or nil when the set is empty.
  def max
    @max ||= @ranges.map(&:max).max
  end

  # Smallest lower bound of any range, or nil when the set is empty.
  def min
    @min ||= @ranges.map(&:min).min
  end

  # Mean length (max - min) of the ranges as a Float; 0.0 when the set is
  # empty rather than NaN.
  def avg
    return 0.0 if @ranges.empty?

    covered / @ranges.count.to_f
  end

  # Total length (max - min) summed over all ranges.
  def covered
    @ranges.reduce(0) { |total, r| total + (r.max - r.min) }
  end

  # Number of disjoint ranges held.
  def count
    @ranges.count
  end

  private

  # True when either end of one range lies inside the other.
  def overlapping?(a, b)
    a.include?(b.min) || a.include?(b.max) ||
      b.include?(a.min) || b.include?(a.max)
  end

  # Forgets memoized bounds after the set of ranges changed.
  def reset_cache
    @max_excluded = nil
    @min_excluded = nil
    @max = nil
    @min = nil
  end

end
69
+ end
data/lib/names.rb ADDED
@@ -0,0 +1,85 @@
1
+ require "net/http"
2
+ require "json"
3
+ require "sqlite3"
4
+
5
+ require_relative "pdf-extract"
6
+
7
+ module PdfExtract::Names
8
+
9
# Name detector backed by a local SQLite database of family names and a
# comma-separated stop word list, both loaded when the class is defined.
class NamesDatabase
  # Weight added per matching row whose third column equals 1.
  @@ambiguous_weighting = 0.1
  # Weight added per matching row otherwise.
  @@unambiguous_weighting = 1.0

  # Absolute path of +data_filename+ inside the ../data directory that
  # sits next to this file's directory.
  def self.path_to_data(data_filename)
    File.join(File.dirname(File.expand_path(__FILE__)), "../data/" + data_filename)
  end

  @@db = SQLite3::Database.new(path_to_data("familynames.db"), {:readonly => true})

  # File.read closes the file handle again. Entries are stripped so that
  # whitespace around the commas or a trailing newline cannot keep a stop
  # word from matching.
  @@stop_words = File.read(path_to_data("stopwords.txt")).split(",").map(&:strip)

  # Scores how name-like +content+ is.
  #
  # Every whitespace-separated word that is longer than one character and
  # is not a stop word is looked up in the names table (capitalised, with
  # the letter after each hyphen capitalised too). Each matching row adds
  # its weighting to a running sum.
  #
  # Returns {:name_frequency => 0} when nothing matched, otherwise
  # {:name_frequency => sum / number of words}.
  def self.detect_names(content)
    words = content.split
    sum = 0.0

    words.each do |word|
      word = word.downcase

      # Both conditions are spelled out with explicit parentheses: written
      # as `include? word && ...` the whole `&&` expression becomes the
      # argument of include?, and the filter never rejects anything.
      next if word.length <= 1 || @@stop_words.include?(word)

      query_word = word.capitalize.gsub(/-(.)/) { |s|
        "-" + s[1].capitalize
      }

      @@db.execute("select * from names where name = ?", query_word) do |row|
        sum += row[2] == 1 ? @@ambiguous_weighting : @@unambiguous_weighting
      end
    end

    if sum == 0
      {:name_frequency => 0}
    else
      {:name_frequency => (sum / words.length.to_f)}
    end
  end
end
50
+
51
# Name detector that delegates to a remote HTTP service.
class NamesService
  # Posts +content+ to the /detect endpoint of names.crrd.dyndns.org.
  #
  # Returns the parsed JSON body of a 200 response, with keys symbolized so
  # that the result is accessed the same way as the fallback below. Any
  # other response code, and any StandardError raised while talking to the
  # service or parsing its reply, deliberately yields the fallback
  # {:name_frequency => 0.0}: detection is best-effort and must not raise.
  def self.detect_names(content)
    fallback = {:name_frequency => 0.0}

    response = Net::HTTP.start "names.crrd.dyndns.org" do |http|
      http.post "/detect", content
    end

    return fallback unless response.code == "200"

    JSON.parse response.body, :symbolize_names => true
  rescue StandardError
    fallback
  end
end
67
+
68
# Detector that performs no detection at all.
class NoDetection
  class << self
    # Ignores +content+ and reports a name frequency of 0.0.
    def detect_names(content)
      Hash[:name_frequency, 0.0]
    end
  end
end
73
+
74
+ @@detector = NamesDatabase
75
+
76
# Selects the detector that detect_names delegates to.
#
# detector_class - object responding to detect_names(content).
def self.detector=(detector_class)
  @@detector = detector_class
end
79
+
80
# Runs the currently selected detector over +content+ and returns its
# result unchanged.
def self.detect_names(content)
  @@detector.detect_names(content)
end
83
+
84
+ end
85
+