rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
|
22
|
+
#
|
|
23
|
+
# Text -- Rpdf2txt -- 28.11.2002 -- aschrafl@ywesee.com
|
|
24
|
+
|
|
25
|
+
require 'rpdf2txt/text_state'
|
|
26
|
+
require 'rpdf2txt/textparser'
|
|
27
|
+
require 'rpdf2txt/object'
|
|
28
|
+
require 'iconv'
|
|
29
|
+
|
|
30
|
+
module Rpdf2txt
|
|
31
|
+
# Walks the syntax tree of one PDF content stream and turns every text
# operator into a positioned snippet (a copy of the running TextState).
class Text
  attr_writer :current_page
  attr_reader :text_state, :transformation_matrix

  # src             - raw content-stream source; runs of CR/LF are collapsed
  #                   to a single "\n" before parsing.
  # target_encoding - encoding name handed to the TextState.
  # tm              - transformation matrix applied to the TextState.
  def initialize(src, target_encoding='utf8',
                 tm=Matrix[[1,0,0],[0,-1,0],[0,0,1]])
    @src = src.gsub(/[\r\n]+/n, "\n")
    @text_state = TextState.new(target_encoding)
    @transformation_matrix = tm
    @text_state.transformation_matrix = tm
  end

  ## FIXME: generic_symbol_font is a workaround. Implement a way to
  ##        pass unicode-snippets (or rework everything to unicode)
  #
  # Returns a font whose base font is Symbol: a fresh Symbol font when
  # +font+ is nil, +font+ itself when its basefont_name already matches
  # /symbol/i, otherwise a duplicate with :basefont overwritten.
  def generic_symbol_font(font)
    if(font.nil?)
      Font.new('<< /BaseFont /Symbol')
    elsif(/symbol/in.match(font.basefont_name))
      font
    else
      genfont = font.dup
      # NOTE(review): #dup is shallow by default, so this may also change
      # the attributes of the original font -- confirm against Font.
      genfont.attributes[:basefont] = 'Symbol'
      genfont
    end
  end

  # Looks up +font_name+ (downcased, as a Symbol) on the current page.
  # Returns nil when no page has been assigned.
  def get_font(font_name)
    return nil unless @current_page
    @current_page.font(font_name.to_s.downcase.intern)
  end

  # Translates a character code through the current font.
  #
  # Tries the font's cmap (via SymbolMap::SYMBOL_ENTITIES) first, then its
  # to_unicode map; in the latter case the font is flagged as '/UTF8'.
  # Returns the replacement String, or nil when there is no current font
  # or no mapping applies.
  def mapped_ascii(ascii)
    if(@current_font)
      if((cmap = @current_font.cmap) && (map = cmap.map) \
         && (unicode_bytes = map[ascii]) \
         && (ascii = SymbolMap::SYMBOL_ENTITIES[unicode_bytes]))
        ascii.chr
      elsif((map = @current_font.to_unicode) \
            && (utf8 = map.to_utf8(ascii)))
        @current_font.attributes[:encoding] = '/UTF8'
        #@text_state.set_font(@current_font)
        [utf8].pack('U')
      end
    end
  end

  # Parses the source and returns the list of snippets found in it.
  #
  # On any failure the offending source is dumped for debugging and the
  # exception is re-raised unchanged. The dump goes to $stderr so that it
  # cannot end up in the middle of extracted text written to $stdout.
  def scan
    @snippets = []
    ast = Rpdf2txt.text_parser.parse(@src)
    scan_tree(ast)
    @snippets
  rescue Exception
    # Exception (not StandardError) is rescued on purpose: nothing is
    # swallowed, the error is only annotated with the source and re-raised.
    $stderr.puts @src
    raise
  end

  # Applies every child of every node below +ast+ to the text state,
  # recursing into nested 'values' nodes.
  def scan_tree(ast)
    ast.values.each { |node|
      # An empty array has no first value; guard against it instead of
      # failing with NoMethodError on nil.
      if(node.name == 'Array') \
        && (first_value = node.values.first) \
        && (first_value.children_names.first == 'kerning')
        ## If the case [ 34 (foo) ] crops up, the first operation
        ## executed on @text_state is advance_x. This results in
        ## the width of the last text-snippet being calculated twice.
        ## This here is a workaround that resets the snippet to an
        ## empty string if we are encountering a [ ??? ] construct
        ## (an array).
        ## TODO: find a more general solution
        @text_state.set_txt('')
      end
      node.children_names.each { |child_name|
        case child_name
        when 'alpha'
          @text_state.tmalpha = node.alpha.value.to_f
        when 'beta'
          @text_state.tmbeta = -node.beta.value.to_f
          skew = node.beta.value.to_f > 0.1
          if(@current_font && @current_font.skewed != skew)
            # work on a copy so the page's font object keeps its own flag
            @current_font = @current_font.dup
            @current_font.skewed = skew
            @text_state.set_font(@current_font)
          end
        when 'xscale'
          @text_state.set_xscale(node.xscale.value)
        when 'yscale'
          @text_state.set_yscale(node.yscale.value)
        when 'charspace'
          @text_state.set_char_spacing(node.charspace.value)
        when 'kerning'
          @text_state.advance_x(node.kerning.value.to_f)
        when 'tdleadx'
          @text_state.update_x(node.tdleadx.value.to_f)
        when 'tdleady'
          lead = node.tdleady.value.to_f
          @text_state.set_lead(lead)
          @text_state.update_y(lead)
        when 'xpos'
          @text_state.update_x(node.xpos.value.to_f)
        when 'ypos'
          @text_state.update_y(node.ypos.value.to_f)
        when 'fontname'
          @current_font = get_font(node.fontname.value)
          @text_state.set_font(@current_font)
          @text_state.set_font_size(node.fontsize.value)
        when 'tmx'
          @text_state.set_x(node.tmx.value.to_f)
        when 'tmy'
          @text_state.set_y(node.tmy.value)
        when 'render'
          val = node.render.value
          if(@current_font && @current_font.rendering_mode != val)
            @current_font = @current_font.dup
            @current_font.rendering_mode = val
            @text_state.set_font(@current_font)
          end
        when 'wordspace'
          @text_state.set_word_spacing(node.wordspace.value)
        when 'values'
          scan_tree(node)
        when 'snippet'
          snip(node.snippet.value)
        when 'aposnippet'
          @text_state.step
          snip(node.aposnippet.value)
        when 'linebreak'
          @text_state.step
        when 'textrise'
          #add functionality for textrise p 387 pdf manual
        when 'hexsnippet'
          hex_bytes = node.hexsnippet.value
          char = ''
          hex_bytes.scan(/.{2,4}/n) { |pair|
            dec_byte = pair.hex
            char << (mapped_ascii(dec_byte) || '?')
          }
          _snip(char)
        end
      }
    }
  end

  # Records a literal string operand. +snippet+ includes its delimiters,
  # which are cut off; \n, \r and \t escapes become spaces, escaped
  # parentheses are unescaped and every byte is passed through
  # mapped_ascii (unmapped bytes are kept as they are).
  def snip(snippet)
    snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
    snippet_text.gsub!(/\\([()])/n, '\1')
    snippet_text.gsub!(/./n) { |char|
      # unpack('C') yields the byte value on every Ruby version, whereas
      # char[0] is a one-character String from Ruby 1.9 on and would
      # never match the integer keys mapped_ascii looks up.
      self.mapped_ascii(char.unpack('C').first) || char
    }
    _snip(snippet_text)
  end

  # Stores +snippet_text+ in the text state, recomputes its position using
  # the current page's :rotate attribute (0 without a page) and appends a
  # copy of the state to the snippet list. Returns that copy.
  def _snip(snippet_text)
    @text_state.set_txt(snippet_text)
    @text_state.update!(@current_page ? @current_page.attributes[:rotate] : 0)
    @snippets.push(@text_state.dup).last
  end

  # Replaces the text state; the new state inherits this object's
  # transformation matrix.
  def text_state=(text_state)
    text_state.transformation_matrix = @transformation_matrix
    @text_state = text_state
  end
end
|
|
182
|
+
end
|
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
|
22
|
+
#
|
|
23
|
+
# TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
|
|
24
|
+
|
|
25
|
+
module Rpdf2txt
|
|
26
|
+
# Common base of everything that is placed on a page. Keeps the text
# matrix (tm*) and current transformation matrix (cm*) components and
# derives the final coordinates from them.
class PositionedElement
  USER_SPACE = 1000.0

  attr_accessor :media_box, :tmalpha, :tmbeta

  # Hook without behavior at this level; always returns nil.
  def fire_callbacks(previous_positioned_element, callback_handler)
  end

  # Sets the horizontal origin and clears both horizontal displacements.
  def set_x(x)
    @dtmx = 0
    @tmx = 0
    @tmxoffset = x.to_f
  end

  # Stores the horizontal text scale, rounded to 1/USER_SPACE.
  def set_xscale(xscale)
    @tmxscale = round_to_user_space_grid(xscale.to_f)
  end

  # Sets the vertical origin and clears the vertical displacement.
  def set_y(y)
    @tmy = 0
    @tmyoffset = y.to_f
  end

  # Stores the vertical text scale, rounded to 1/USER_SPACE.
  def set_yscale(yscale)
    @tmyscale = round_to_user_space_grid(yscale.to_f)
  end

  # Takes the six relevant components out of +tm+ (indexed as tm[row, col]).
  # The two scales are rounded to 1/USER_SPACE, the rest is copied as is.
  def transformation_matrix=(tm)
    ### Original author's note: "This shouldn't happen, but we do have an
    ### example of it happening in
    ###   /Producer (Hyf PDF Output Library 2.2.3 \(Windows\))
    ###   /Producer (Mac OS X 10.4.6 Quartz PDFContext)"
    @cmxscale  = round_to_user_space_grid(tm[0,0])
    @cmalpha   = tm[0,1]
    @cmbeta    = tm[1,0]
    @cmyscale  = round_to_user_space_grid(tm[1,1])
    @cmxoffset = tm[2,0]
    @cmyoffset = tm[2,1]
  end

  # Base elements never overlap preceding whitespace.
  def whitespace_overlap?(previous)
    false
  end

  # Recomputes @x, @y, @x2, @y2 and @right_edge from the matrix state.
  #
  # +rotation+ is the page rotation in degrees. For an odd number of
  # quarter turns the horizontal and vertical axes trade places and the
  # alpha/beta components are used instead of the scales.
  def update!(rotation=0)
    quarter_turns = rotation.to_f.round / 90
    if quarter_turns % 2 == 1
      across    = @tmxoffset + @tmy * @tmalpha
      along     = @tmyoffset + (@tmx + @dtmx) * @tmbeta
      across_to = across + @font_size * @tmalpha
      along_to  = along + @w * @tmbeta
      box_to    = along + @boxwidth * @tmbeta
      h_from, v_from, h_to, v_to, h_box =
        along, across, along_to, across_to, box_to
    else
      left   = @tmxoffset + (@tmx + @dtmx) * @tmxscale
      top    = @tmyoffset - @tmy * @tmyscale
      right  = left + @w * @tmxscale
      bottom = top - @font_size * @tmyscale
      box_to = left + @boxwidth * @tmxscale
      h_from, v_from, h_to, v_to, h_box =
        left, top, right, bottom, box_to
    end
    # state is only touched once every value has been computed
    @x  = h_from + @cmxoffset
    @y  = v_from + @cmyoffset
    @x2 = h_to + @cmxoffset
    @y2 = v_to + @cmyoffset
    @right_edge = h_box + @cmxoffset
  end

  # Orders elements left-to-right within a line and by y otherwise.
  # Relies on the subclass to provide same_line.
  def <=> (other)
    return @x <=> other.x if same_line(other)
    vertical = @y <=> other.y
    return vertical unless other.is_a?(self.class)
    # @cmyscale may be negative, reversing the sort-order
    vertical * (@cmyscale == 0 ? 1 : @cmyscale)
  end

  private

  # Rounds +value+ to the nearest multiple of 1/USER_SPACE.
  def round_to_user_space_grid(value)
    (value * USER_SPACE).round.to_f / USER_SPACE
  end
end
|
|
99
|
+
# The running text state of a content stream: font, sizes, spacing, the
# current snippet of text and its measured width. Copies of this object
# are what ends up in the list of extracted snippets.
class TextState < PositionedElement
  UTF = /utf/in
  attr_accessor :font, :txt
  include Comparable
  attr_reader :y, :x, :x2, :y2, :w, :boxwidth, :xscale, :font_size, :yscale,
    :right_edge

  # target_encoding - name of the output encoding; '//TRANSLIT//IGNORE'
  #                   is appended before it is handed to Iconv.
  def initialize(target_encoding='utf8')
    @boxwidth = 0
    @x = @tmx = @dtmx = @tmxoffset = @cmxoffset = 0.0
    @y = @tmy = @tmyoffset = @cmyoffset = 0.0
    @w = 0.0
    @tmalpha = @cmalpha = 0.0
    @tmbeta = @cmbeta = 0.0
    @tmxscale = @cmxscale = 1.0
    @tmyscale = @cmyscale = 1.0
    @lead = nil
    @font = nil
    @font_size = 1
    @char_spacing = 0
    @word_spacing = 0
    @target_encoding = target_encoding + '//TRANSLIT//IGNORE'
    self.transformation_matrix = Matrix[[1,0,0],[0,1,0],[0,0,1]]
  end

  # Moves the pen past the current snippet; +kerning+ is given in
  # 1/USER_SPACE units and is subtracted from the advance.
  def advance_x(kerning = 0)
    @dtmx += @w - kerning/USER_SPACE
  end

  # Width of one character (a byte value, or a String whose first byte is
  # used) at the current font size.
  #
  # Uses the font's width for the byte, else the font's :avgwidth
  # attribute, else 300. Character spacing is always added, word spacing
  # only for a space (byte 32).
  def char_width(char)
    if(char.is_a? String)
      # unpack('C') returns the first byte as an Integer on every Ruby
      # version; char[0] only does so on Ruby 1.8.
      char = char.unpack('C').first
    end
    w = 0.0
    if(@font && (width = @font.width(char)))
      w = width
    elsif(@font && (avg = @font.attributes[:avgwidth]))
      w = avg
    end
    w = 300.0 if w == 0
    w += @char_spacing
    if(char==32)
      w += @word_spacing
    end
    w * @font_size / USER_SPACE
  end

  # True when this snippet is blank, sits on the same line as +previous+
  # and +previous+ reaches at least to this snippet's horizontal middle.
  def whitespace_overlap?(previous)
    previous && empty? && same_line(previous) \
      && previous.x2 >= (@x + (@x2 - @x) / 2)
  end

  # Announces font / font-size changes relative to +previous+ (the font
  # is always announced when there is no predecessor), then sends the text.
  def send_content(previous, callback_handler)
    if(previous)
      if(previous.font != @font)
        callback_handler.new_font(@font)
      end
      if(previous.font_height != self.font_height)
        callback_handler.new_fontsize(self.font_height)
      end
    else
      callback_handler.new_font(@font)
    end
    callback_handler.send_flowing_data(@txt)
  end

  # Width of a space at the current font size: the font's width for byte
  # 32, else its :avgwidth attribute, else 300; plus character spacing.
  def space_width
    w = 300.0
    if(@font && (width = @font.width(32)))
      w = width
    elsif(@font && (avg = @font.attributes[:avgwidth]))
      w = avg
    end
    w += @char_spacing
    w * @font_size / USER_SPACE
  end

  # Emits the line break, the inter-word spaces and/or the paragraph
  # break that separate this snippet from +previous+.
  def fire_early_callbacks(previous, callback_handler)
    if(previous)
      if(!same_line(previous))
        callback_handler.send_line_break
      elsif(!same_word(previous))
        if(spaces = previous.count_spaces(@x - previous.x2))
          callback_handler.send_flowing_data(' '*spaces.abs)
        end
      end
      if(new_paragraph(previous))
        callback_handler.send_paragraph
      end
    end
  end

  # Number of lines covered by +displacement+, measured in leads.
  # Falls back to 1 when the division cannot be carried out.
  def count_lines(displacement)
    (displacement / lead).abs.ceil
  rescue
    1
  end

  # Number of spaces that fit into +displacement+, or nil when the gap is
  # not wider than one space (or the space width is zero).
  def count_spaces(displacement)
    x = space_width * @tmxscale + @font_size * @tmalpha
    y = @font_size * @tmyscale + space_width * @tmbeta
    width = x * @cmxscale + y * @cmalpha
    if(width.nonzero? && displacement > width)
      (displacement / width).round
    end
  rescue ZeroDivisionError
    warn "Ignoring Division by Zero: #{displacement.inspect}/#{width.inspect}"
  end

  # True when there is no text or only whitespace.
  def empty?
    @txt.nil? || @txt.strip.empty?
  end

  # The explicit lead if one was set, else -1.2 times the font height.
  def lead
    @lead || -font_height * 1.2
  end

  def font_height
    @font_size
  end

  # True when the vertical distance to +last_text_state+ exceeds 1.5
  # times its font height.
  def new_paragraph(last_text_state)
    return false if(last_text_state.font_size.nil?)
    #1.5 is an approximate value
    spacing = last_text_state.font_height * 1.5
    last_y = last_text_state.y
    ((last_y - @y).abs > spacing.abs)
  end

  # Converts +txt+ from the current font's encoding to the target
  # encoding. Returns +txt+ unconverted when there is no usable font or
  # Iconv rejects the encoding or the byte sequence.
  def recode_txt(txt)
    enc = @font.encoding
    if(enc.is_a?(Encoding))
      # it would certainly be nice to do without all this iconving,
      # but since CMaps always contain utf8, and using utf16 in
      # Symbol.from_* is so much more practical than dealing with
      # variable-length utf8 encoding for the characters in the
      # Symbol font, we'll leave it at dtsttcpw for the moment.
      if(@font.symbol?)
        txt = enc.convert_symbol(txt)
        if(UTF.match(@target_encoding))
          @utf16_iconv ||= Iconv.new(@target_encoding.to_s, 'utf16be')
          txt = @utf16_iconv.iconv(Symbol.to_utf16(txt))
        end
      elsif(tu = @font.to_unicode)
        txt = tu.to_utf8(txt)
        if(UTF.match(@target_encoding))
          @utf8_iconv ||= Iconv.new(@target_encoding.to_s, 'utf8')
          txt = @utf8_iconv.iconv(txt)
        else
          @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
          txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
        end
      end
      txt
    # FIXME: fix how encodings and Symbol font are handled
    elsif(UTF.match(enc) && !UTF.match(@target_encoding) && @font.symbol?)
      @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
      txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
    else
      @iconv ||= Iconv.new(@target_encoding.to_s, enc.to_s)
      @iconv.iconv(txt)
    end
  rescue NoMethodError, Iconv::InvalidEncoding, Iconv::IllegalSequence
    # NoMethodError covers the case of @font being nil
    txt
  end

  # True when +other+ is a TextState on the same line whose right edge is
  # less than two of its space widths to the left of this snippet.
  def same_column(other)
    return false unless same_line(other)
    if(other.is_a?(TextState))
      testwidth = other.space_width * 2.0
      width = @x - other.right_edge
      width < testwidth
    else
      false
    end
  end

  # True when +other+ is a TextState whose vertical extent overlaps this
  # one's by more than 40% of the height of either of the two.
  def same_line(other)
    if(other.is_a?(TextState))
      sy1, sy2 = [@y, self.y2].sort
      oy1, oy2 = [other.y, other.y2].sort

      pair = [[sy1, sy2], [oy1, oy2]].sort
      overlap = pair[0][1] - pair[1][0]

      [sy2 - sy1, oy2 - oy1].any? { |height|
        # negative overlap means the lines don't touch
        overlap / height > 0.4
      }
    else
      false
    end
  end

  # True when +other+ is a TextState on the same line that ends less than
  # half of its space width to the left of this snippet.
  def same_word(other)
    return false unless same_line(other)
    if(other.is_a?(TextState))
      testwidth = other.space_width / 2.0
      width = @x - other.x2
      width < testwidth
    else
      false
    end
  end

  # Switches the font and drops the cached generic converter, which was
  # built for the previous font's encoding.
  def set_font(font)
    @iconv = nil
    @font = font
  end

  def set_font_size(size)
    @font_size = size.to_f
  end

  def set_lead(lead)
    @lead = lead.to_f
  end

  # Stores the character spacing, scaled by USER_SPACE.
  def set_char_spacing(line)
    @char_spacing = line.to_f * USER_SPACE
  end

  # Stores +txt+ (recoded to the target encoding) and measures it:
  # @boxwidth is the width without trailing whitespace, @w the width
  # including it. Octal escapes in +txt+ are resolved in place first.
  def set_txt(txt)
    # Resolve octal escapes before measuring, so that e.g. \334 counts as
    # the single byte 0xDC (U-umlaut in Latin-1) and not as four
    # characters -- otherwise the calculated string width is wrong.
    unescape_txt!(txt)
    @boxwidth = 0
    txt.rstrip.each_byte do |char|
      @boxwidth += char_width(char)
    end
    @w = @boxwidth
    # \z anchors at the end of the string. With `$` (end of *line*) the
    # whitespace in front of an embedded newline was picked up instead,
    # although rstrip had left it in place and it was already counted.
    if white = txt[/\s+\z/u]
      white.each_byte do |char|
        @w += char_width(char)
      end
    end
    @txt = recode_txt(txt)
  end

  # Stores the word spacing, scaled by USER_SPACE.
  def set_word_spacing(word_spacing)
    @word_spacing = word_spacing.to_f * USER_SPACE
  end

  # Starts a new line: resets the horizontal advance and moves down by
  # one lead.
  def step
    @dtmx = 0
    @tmy -= lead
  end

  # Relative horizontal move; resets the horizontal advance.
  def update_x(x_val)
    @dtmx = 0
    @tmx += x_val.to_f
  end

  # Relative vertical move; resets the horizontal advance.
  def update_y(y_val)
    @dtmx = 0
    @tmy -= y_val.to_f
  end

  # Replaces three-digit octal escapes (\ddd) in +txt+ with the byte they
  # denote. Modifies +txt+ in place; returns nil when nothing was replaced.
  def unescape_txt!(txt)
    txt.gsub!(/\\([0-9]{3})/n) { |match| $1.oct.chr }
  end

  protected
  attr_writer :x
end
|
|
337
|
+
# Base class of positioned elements that carry no text of their own.
# Anything the element does not implement itself is answered by the copy
# of the text state it was given through text_state=.
class NontextElement < PositionedElement
  attr_accessor :current_page
  attr_reader :x, :y, :x2, :y2, :text_state

  def initialize
    @x = @matrix_x = 0.0
    @y = 0.0
    @w = 0.0
    @cmxscale = @tmxscale = 1.0
    @cmyscale = @tmyscale = 1.0
    @xscale = 1.0
    @yscale = 1.0
    @space_width = -1
    super
  end

  # A non-text element is never considered empty.
  def empty?
    false
  end

  # Sends a line break when +previous+ is on another line and announces
  # this element's font when it has one that differs from +previous+'s.
  def fire_early_callbacks(previous, callback_handler)
    if(previous)
      unless(same_line(previous))
        callback_handler.send_line_break
      end
      if(@font && previous.font != @font)
        callback_handler.new_font(@font)
      end
    end
  end

  def same_column(other)
    false
  end

  # Adopts media box and font of +ts+ and keeps a private copy of it as
  # the delegation target.
  def text_state=(ts)
    @media_box = ts.media_box
    @font = ts.font
    @text_state = ts.dup
  end

  # Forwards unknown messages to the text state. Without a text state the
  # default handling applies, so the resulting NoMethodError names this
  # element rather than nil.
  def method_missing(name, *args, &block)
    if(@text_state)
      @text_state.send(name, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? in line with what method_missing can answer.
  def respond_to_missing?(name, include_private = false)
    (@text_state && @text_state.respond_to?(name, include_private)) || super
  end
end
|
|
376
|
+
# A horizontal rule. It has no extent of its own: x2 and y2 are simply
# other names for x and y.
class HorizontalRule < NontextElement
  alias_method :x2, :x
  alias_method :y2, :y

  # x, y - origin of the rule
  # dm   - transformation matrix in effect where the rule was found
  def initialize(x, y, dm)
    super()
    self.transformation_matrix = dm
    set_x(x)
    set_y(y)
  end

  # Sends a rule unless there is no predecessor or the predecessor is a
  # rule on the same line.
  def send_content(previous, callback_handler)
    callback_handler.send_hr if(previous && !same_line(previous))
  end

  # Only another HorizontalRule can share a line with this one, and only
  # when the two are less than 10 units apart vertically.
  def same_line(other)
    other.is_a?(HorizontalRule) && (other.y - @y).abs < 10
  end

  # Rules on the same line compare as equal; everything else is ordered
  # by the inherited comparison.
  def <=> other
    return 0 if other.is_a?(HorizontalRule) && same_line(other)
    super
  end
end
|
|
405
|
+
# The placement of an image on a page. The image is either given directly
# (an InlineImage) or by resource name, in which case it is fetched from
# the current page's resources on first use.
class ImagePlacement < NontextElement
  attr_reader :resource

  # resource - an InlineImage, or a resource name whose first character
  #            is dropped and the rest downcased to form the lookup key
  # x, y     - position of the image
  # dm       - transformation matrix in effect for the image
  def initialize(resource, x, y, dm)
    super()
    if InlineImage === resource
      @xobject = resource
    else
      @resource = resource.downcase[1..-1].to_sym
    end
    self.transformation_matrix = dm
    @x = x
    # the matrix has to be assigned first: @cmyscale comes from it
    @y = y - @cmyscale / 2
  end

  # The image data of the underlying object.
  def image
    xobject.image
  end

  # Images share a line only with elements at exactly the same y.
  def same_line(other)
    @y == other.y
  end

  # Hands this placement to the handler, provided its object can be found.
  def send_content(previous, callback_handler)
    callback_handler.send_image(self) if xobject
  end

  # The inline image, or the page resource registered under @resource
  # (looked up once, then remembered).
  def xobject
    return @xobject if @xobject
    @xobject = @current_page.resources.xobject(@resource)
  end
end
|
|
434
|
+
end
|