rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,182 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# Text -- Rpdf2txt -- 28.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
require 'rpdf2txt/text_state'
|
26
|
+
require 'rpdf2txt/textparser'
|
27
|
+
require 'rpdf2txt/object'
|
28
|
+
require 'iconv'
|
29
|
+
|
30
|
+
module Rpdf2txt
|
31
|
+
# Interprets the operators of one parsed PDF text object. Every text snippet
# that is encountered is measured and positioned through a TextState, and a
# snapshot (dup) of that TextState is collected as the result of #scan.
class Text
  attr_writer :current_page
  attr_reader :text_state, :transformation_matrix

  # src::             source of the text object; runs of CR/LF are collapsed
  #                   into a single "\n" before parsing
  # target_encoding:: passed on to the TextState
  # tm::              transformation matrix handed to the TextState
  def initialize(src, target_encoding='utf8',
                 tm=Matrix[[1,0,0],[0,-1,0],[0,0,1]])
    @src = src.gsub(/[\r\n]+/n, "\n")
    @text_state = TextState.new(target_encoding)
    @transformation_matrix = tm
    @text_state.transformation_matrix = tm
  end

  ## FIXME: generic_symbol_font is a workaround. Implement a way to
  ##        pass unicode-snippets (or rework everything to unicode)
  #
  # Returns a font whose base font is Symbol: a fresh Font when none is
  # given, the font itself when its base font name already contains
  # "symbol", otherwise a dup with :basefont overridden.
  # NOTE(review): the dup looks shallow, so the attributes hash may be
  # shared with the original font - confirm against Font#dup.
  def generic_symbol_font(font)
    return Font.new('<< /BaseFont /Symbol') if font.nil?
    return font if /symbol/in.match(font.basefont_name)
    genfont = font.dup
    genfont.attributes[:basefont] = 'Symbol'
    genfont
  end

  # Looks up a font of the current page by its (case-insensitive) name.
  # Returns nil when no current page has been assigned.
  def get_font(font_name)
    return nil unless @current_page
    @current_page.font(font_name.to_s.downcase.intern)
  end

  # Translates a character code (Integer) through the current font.
  #
  # 1. If the font's cmap maps the code and SymbolMap::SYMBOL_ENTITIES knows
  #    the mapped value, the entity is returned as a one-character string.
  # 2. Otherwise, if the font has a to_unicode map that knows the code, the
  #    font's :encoding attribute is switched to '/UTF8' and the code point
  #    is returned UTF-8 encoded.
  # 3. Otherwise (or without a current font) nil is returned.
  #
  # The symbol entity is kept in its own local: re-using +ascii+ for it
  # would clobber the code with nil whenever step 1 fails on the last
  # lookup, and step 2 would then be queried with nil.
  def mapped_ascii(ascii)
    return nil unless @current_font
    if((cmap = @current_font.cmap) && (map = cmap.map) \
      && (unicode_bytes = map[ascii]) \
      && (entity = SymbolMap::SYMBOL_ENTITIES[unicode_bytes]))
      entity.chr
    elsif((map = @current_font.to_unicode) \
      && (utf8 = map.to_utf8(ascii)))
      @current_font.attributes[:encoding] = '/UTF8'
      #@text_state.set_font(@current_font)
      [utf8].pack('U')
    end
  end

  # Parses the source and returns the collected TextState snapshots.
  # On any failure the offending source is written to $stderr (not stdout,
  # so it cannot end up in regular output) and the exception is re-raised
  # unchanged.
  def scan
    @snippets = []
    ast = Rpdf2txt.text_parser.parse(@src)
    scan_tree(ast)
    @snippets
  rescue Exception
    $stderr.puts @src
    raise
  end

  # Walks the values of +ast+ and applies every recognised child of each
  # node to the text state; 'values' children are descended into
  # recursively, snippet children produce TextState snapshots.
  def scan_tree(ast)
    ast.values.each { |node|
      if(node.name == 'Array') \
        && (node.values.first.children_names.first == 'kerning')
        ## If the case [ 34 (foo) ] crops up, the first operation
        ## executed on @text_state is advance_x. This results in
        ## the width of the last text-snippet being calculated twice.
        ## This here is a workaround that resets the snippet to an
        ## empty string if we are encountering a [ ??? ] construct
        ## (an array).
        ## TODO: find a more general solution
        @text_state.set_txt('')
      end
      node.children_names.each { |child_name|
        case child_name
        when 'alpha'
          @text_state.tmalpha = node.alpha.value.to_f
        when 'beta'
          @text_state.tmbeta = -node.beta.value.to_f
          skew = node.beta.value.to_f > 0.1
          if(@current_font && @current_font.skewed != skew)
            # work on a copy so the page's font object stays untouched
            @current_font = @current_font.dup
            @current_font.skewed = skew
            @text_state.set_font(@current_font)
          end
        when 'xscale'
          @text_state.set_xscale(node.xscale.value)
        when 'yscale'
          @text_state.set_yscale(node.yscale.value)
        when 'charspace'
          @text_state.set_char_spacing(node.charspace.value)
        when 'kerning'
          @text_state.advance_x(node.kerning.value.to_f)
        when 'tdleadx'
          @text_state.update_x(node.tdleadx.value.to_f)
        when 'tdleady'
          lead = node.tdleady.value.to_f
          @text_state.set_lead(lead)
          @text_state.update_y(lead)
        when 'xpos'
          @text_state.update_x(node.xpos.value.to_f)
        when 'ypos'
          @text_state.update_y(node.ypos.value.to_f)
        when 'fontname'
          @current_font = get_font(node.fontname.value)
          @text_state.set_font(@current_font)
          @text_state.set_font_size(node.fontsize.value)
        when 'tmx'
          @text_state.set_x(node.tmx.value.to_f)
        when 'tmy'
          @text_state.set_y(node.tmy.value)
        when 'render'
          val = node.render.value
          if(@current_font && @current_font.rendering_mode != val)
            @current_font = @current_font.dup
            @current_font.rendering_mode = val
            @text_state.set_font(@current_font)
          end
        when 'wordspace'
          @text_state.set_word_spacing(node.wordspace.value)
        when 'values'
          scan_tree(node)
        when 'snippet'
          snip(node.snippet.value)
        when 'aposnippet'
          @text_state.step
          snip(node.aposnippet.value)
        when 'linebreak'
          @text_state.step
        when 'textrise'
          #add functionality for textrise p 387 pdf manual
        when 'hexsnippet'
          # NOTE(review): assumes the value holds bare hex digits without
          # the surrounding < > delimiters - confirm against the grammar.
          hex_bytes = node.hexsnippet.value
          char = ''
          hex_bytes.scan(/.{2,4}/n) { |pair|
            dec_byte = pair.hex
            char << (mapped_ascii(dec_byte) || '?')
          }
          _snip(char)
        end
      }
    }
  end

  # Handles a literal string snippet: drops the first and last character
  # (the delimiters), turns the escapes \n, \r and \t into a blank,
  # unescapes parentheses and maps every character through the current
  # font before the text is recorded.
  def snip(snippet)
    snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
    snippet_text.gsub!(/\\([()])/n, '\1')
    snippet_text.gsub!(/./n) { |char|
      # mapped_ascii expects the numeric code (as the hexsnippet path
      # passes it). String#[] with an Integer only yields that on Ruby 1.8;
      # unpack('C') yields the first byte on every Ruby version.
      self.mapped_ascii(char.unpack('C').first) || char
    }
    _snip(snippet_text)
  end

  # Stores +snippet_text+ in the text state, recomputes its position using
  # the rotation of the current page (0 without a page) and records a
  # snapshot. Returns that snapshot.
  def _snip(snippet_text)
    @text_state.set_txt(snippet_text)
    @text_state.update!(@current_page ? @current_page.attributes[:rotate] : 0)
    @snippets.push(@text_state.dup).last
  end

  # Replaces the text state; the new state inherits this object's
  # transformation matrix.
  def text_state=(text_state)
    text_state.transformation_matrix = @transformation_matrix
    @text_state = text_state
  end
end
|
182
|
+
end
|
@@ -0,0 +1,434 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
module Rpdf2txt
|
26
|
+
# Base class of everything that is placed on a page. It keeps the text
# matrix (tm*) and current transformation matrix (cm*) components as
# instance variables and derives the final coordinates from them.
#
# Subclasses are expected to provide #same_line and to initialise the
# instance variables read by #update! (@font_size, @w, @boxwidth, ...).
class PositionedElement
  USER_SPACE = 1000.0
  attr_accessor :media_box, :tmalpha, :tmbeta

  # Hook without behaviour in the base class.
  def fire_callbacks(previous_positioned_element, callback_handler)
  end

  # Sets the absolute x offset and resets the relative x displacements.
  def set_x(x)
    @tmx = @dtmx = 0
    @tmxoffset = x.to_f
  end

  # Sets the horizontal text scale, rounded to 1/USER_SPACE.
  def set_xscale(xscale)
    @tmxscale = round_to_user_space(xscale.to_f)
  end

  # Sets the absolute y offset and resets the relative y displacement.
  def set_y(y)
    @tmy = 0
    @tmyoffset = y.to_f
  end

  # Sets the vertical text scale, rounded to 1/USER_SPACE.
  def set_yscale(yscale)
    @tmyscale = round_to_user_space(yscale.to_f)
  end

  # Takes the scale, skew and offset components from +tm+, which has to
  # answer to tm[row, column]. The two scales are rounded to 1/USER_SPACE.
  def transformation_matrix=(tm)
    ### This shouldn't happen, but we do have an example of
    ### it happening in
    ### /Producer (Hyf PDF Output Library 2.2.3 \(Windows\))
    ### /Producer (Mac OS X 10.4.6 Quartz PDFContext)
    @cmxscale = round_to_user_space(tm[0,0])
    @cmalpha = tm[0,1]
    @cmbeta = tm[1,0]
    @cmyscale = round_to_user_space(tm[1,1])
    @cmxoffset = tm[2,0]
    @cmyoffset = tm[2,1]
  end

  # Never true for the base class.
  def whitespace_overlap?(previous)
    false
  end

  # Recomputes @x, @y, @x2, @y2 and @right_edge.
  #
  # rotation:: page rotation in degrees; for an odd number of quarter turns
  #            the skew components (tmalpha/tmbeta) are used and the axes
  #            are swapped, otherwise the scale components are used.
  def update!(rotation=0)
    quarter_turns = rotation.to_f.round / 90
    if quarter_turns % 2 == 1
      x = @tmxoffset + @tmy * @tmalpha
      y = @tmyoffset + (@tmx + @dtmx) * @tmbeta
      x2 = x + @font_size * @tmalpha
      y2 = y + @w * @tmbeta
      edge = y + @boxwidth * @tmbeta
      # axes are swapped on a rotated page
      @x = y + @cmxoffset
      @y = x + @cmyoffset
      @x2 = y2 + @cmxoffset
      @y2 = x2 + @cmyoffset
      @right_edge = edge + @cmxoffset
    else
      x = @tmxoffset + (@tmx + @dtmx) * @tmxscale
      y = @tmyoffset - @tmy * @tmyscale
      x2 = x + @w * @tmxscale
      y2 = y - @font_size * @tmyscale
      edge = x + @boxwidth * @tmxscale
      @x = x + @cmxoffset
      @y = y + @cmyoffset
      @x2 = x2 + @cmxoffset
      @y2 = y2 + @cmyoffset
      @right_edge = edge + @cmxoffset
    end
  end

  # Reading order: elements on the same line are ordered by x, everything
  # else by y. Between elements of the same class the y-order is reversed
  # when @cmyscale is negative.
  #
  # Always answers -1, 0, 1 or nil: only the sign of @cmyscale is applied,
  # the comparison result is not multiplied by the scale itself.
  def <=>(other)
    return @x <=> other.x if same_line(other)
    order = @y <=> other.y
    return order if order.nil? || !other.is_a?(self.class)
    # @cmyscale may be negative, reversing the sort-order
    flipped = @cmyscale && @cmyscale < 0
    flipped ? -order : order
  end

  private

  # Rounds +value+ to a multiple of 1/USER_SPACE.
  def round_to_user_space(value)
    (value * USER_SPACE).round.to_f / USER_SPACE
  end
end
|
99
|
+
# The state of the text machinery at the moment a snippet of text is shown:
# font, font size, spacing, position and the (recoded) text itself.
# Snapshots of a TextState are sorted into reading order and then fire
# callbacks on a handler to emit text, blanks, line breaks and paragraphs.
class TextState < PositionedElement
  UTF = /utf/in
  attr_accessor :font, :txt
  include Comparable
  attr_reader :y, :x, :x2, :y2, :w, :boxwidth, :xscale, :font_size, :yscale,
    :right_edge

  # target_encoding:: name of the encoding text is recoded to;
  #                   '//TRANSLIT//IGNORE' is appended for iconv
  def initialize(target_encoding='utf8')
    @boxwidth = 0
    @x = @tmx = @dtmx = @tmxoffset = @cmxoffset = 0.0
    @y = @tmy = @tmyoffset = @cmyoffset = 0.0
    @w = 0.0
    @tmalpha = @cmalpha = 0.0
    @tmbeta = @cmbeta = 0.0
    @tmxscale = @cmxscale = 1.0
    @tmyscale = @cmyscale = 1.0
    @lead = nil
    @font = nil
    @font_size = 1
    @char_spacing = 0
    @word_spacing = 0
    @target_encoding = target_encoding + '//TRANSLIT//IGNORE'
    self.transformation_matrix = Matrix[[1,0,0],[0,1,0],[0,0,1]]
  end

  # Moves the pen past the current text, corrected by +kerning+
  # (given in 1/USER_SPACE units).
  # NOTE(review): @w is already scaled by the font size while the kerning
  # term is not - confirm against the TJ operator definition.
  def advance_x(kerning = 0)
    @dtmx += @w - kerning/USER_SPACE
  end

  # Width of a single character in font-size-scaled units.
  #
  # char:: a character code, or a String whose first byte is used
  #
  # Falls back to the font's :avgwidth attribute and finally to 300 when
  # the font does not know the character. Character spacing is always
  # added, word spacing only for the blank (32).
  def char_width(char)
    if(char.is_a? String)
      # String#[] with an Integer only yields a byte value on Ruby 1.8;
      # unpack('C') does so on every version, which keeps the font lookup
      # and the blank test below working.
      char = char.unpack('C').first
    end
    w = 0.0
    if(@font && (width = @font.width(char)))
      w = width
    elsif(@font && (avg = @font.attributes[:avgwidth]))
      w = avg
    end
    w = 300.0 if w == 0
    w += @char_spacing
    if(char==32)
      w += @word_spacing
    end
    w * @font_size / USER_SPACE
  end

  # True when this state holds only whitespace and the previous element on
  # the same line already reaches beyond the middle of it.
  def whitespace_overlap?(previous)
    previous && empty? && same_line(previous) \
      && previous.x2 >= (@x + (@x2 - @x) / 2)
  end

  # Announces font and font-size changes relative to +previous+ (the font
  # unconditionally when there is no previous element), then emits the text.
  def send_content(previous, callback_handler)
    if(previous)
      if(previous.font != @font)
        callback_handler.new_font(@font)
      end
      if(previous.font_height != self.font_height)
        callback_handler.new_fontsize(self.font_height)
      end
    else
      callback_handler.new_font(@font)
    end
    callback_handler.send_flowing_data(@txt)
  end

  # Width of a blank in the current font (without word spacing), using the
  # same fallbacks as #char_width except that a width of 0 is kept.
  def space_width
    w = 300.0
    if(@font && (width = @font.width(32)))
      w = width
    elsif(@font && (avg = @font.attributes[:avgwidth]))
      w = avg
    end
    w += @char_spacing
    w * @font_size / USER_SPACE
  end

  # Emits whatever separates this element from +previous+: a line break
  # when they are on different lines, otherwise blanks when they do not
  # belong to the same word, and additionally a paragraph when the
  # vertical distance is large enough.
  def fire_early_callbacks(previous, callback_handler)
    if(previous)
      if(!same_line(previous))
        callback_handler.send_line_break
      elsif(!same_word(previous))
        if(spaces = previous.count_spaces(@x - previous.x2))
          callback_handler.send_flowing_data(' '*spaces.abs)
        end
      end
      if(new_paragraph(previous))
        callback_handler.send_paragraph
      end
    end
  end

  # Number of lines covered by +displacement+; 1 when it cannot be
  # computed.
  def count_lines(displacement)
    (displacement / lead).abs.ceil
  rescue
    1
  end

  # Number of blanks that fit into +displacement+, or nil when it is not
  # wider than a single blank.
  def count_spaces(displacement)
    x = space_width * @tmxscale + @font_size * @tmalpha
    y = @font_size * @tmyscale + space_width * @tmbeta
    width = x * @cmxscale + y * @cmalpha
    if(width.nonzero? && displacement > width)
      (displacement / width).round
    end
  rescue ZeroDivisionError
    warn "Ignoring Division by Zero: #{displacement.inspect}/#{width.inspect}"
  end

  # True without text or with whitespace-only text.
  def empty?
    @txt.nil? || @txt.strip.empty?
  end

  # Explicit leading if one was set, otherwise -1.2 times the font height.
  def lead
    @lead || -font_height * 1.2
  end

  def font_height
    @font_size
  end

  # True when the vertical distance to +last_text_state+ exceeds 1.5 times
  # its font height.
  def new_paragraph(last_text_state)
    return false if(last_text_state.font_size.nil?)
    #1.5 is an approximate value
    spacing = last_text_state.font_height * 1.5
    last_y = last_text_state.y
    ((last_y - @y).abs > spacing.abs)
  end

  # Recodes +txt+ from the encoding of the current font to the target
  # encoding. Returns the text as far as it could be converted when there
  # is no usable font/encoding or iconv rejects the input.
  def recode_txt(txt)
    enc = @font.encoding
    if(enc.is_a?(Encoding))
      # it would certainly be nice to do without all this iconving,
      # but since CMaps always contain utf8, and using utf16 in
      # Symbol.from_* is so much more practical than dealing with
      # variable-length utf8 encoding for the characters in the
      # Symbol font, we'll leave it at dtsttcpw for the moment.
      if(@font.symbol?)
        txt = enc.convert_symbol(txt)
        if(UTF.match(@target_encoding))
          @utf16_iconv ||= Iconv.new(@target_encoding.to_s, 'utf16be')
          txt = @utf16_iconv.iconv(Symbol.to_utf16(txt))
        end
      elsif(tu = @font.to_unicode)
        txt = tu.to_utf8(txt)
        if(UTF.match(@target_encoding))
          @utf8_iconv ||= Iconv.new(@target_encoding.to_s, 'utf8')
          txt = @utf8_iconv.iconv(txt)
        else
          @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
          txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
        end
      end
      txt
    # FIXME: fix how encodings and Symbol font are handled
    elsif(UTF.match(enc) && !UTF.match(@target_encoding) && @font.symbol?)
      @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
      txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
    else
      @iconv ||= Iconv.new(@target_encoding.to_s, enc.to_s)
      @iconv.iconv(txt)
    end
  rescue NoMethodError, Iconv::InvalidEncoding, Iconv::IllegalSequence
    # NoMethodError covers a missing font (@font is nil)
    txt
  end

  # True when +other+ is a TextState on the same line whose right edge is
  # less than two blanks away from the start of this one.
  def same_column(other)
    return false unless same_line(other)
    if(other.is_a?(TextState))
      testwidth = other.space_width * 2.0
      width = @x - other.right_edge
      width < testwidth
    else
      false
    end
  end

  # True when +other+ is a TextState whose vertical extent overlaps this
  # one by more than 40% of the height of either of the two.
  def same_line(other)
    if(other.is_a?(TextState))
      sy1, sy2 = [@y, self.y2].sort
      oy1, oy2 = [other.y, other.y2].sort

      pair = [[sy1, sy2], [oy1, oy2]].sort
      overlap = pair[0][1] - pair[1][0]

      [sy2 - sy1, oy2 - oy1].any? { |height|
        # negative overlap means the lines don't touch
        overlap / height > 0.4
      }
    else
      false
    end
  end

  # True when +other+ is a TextState on the same line that ends less than
  # half a blank before this one starts.
  def same_word(other)
    return false unless same_line(other)
    if(other.is_a?(TextState))
      testwidth = other.space_width / 2.0
      width = @x - other.x2
      width < testwidth
    else
      false
    end
  end

  # Changes the font; the cached font-dependent converter is dropped.
  def set_font(font)
    @iconv = nil
    @font = font
  end

  def set_font_size(size)
    @font_size = size.to_f
  end

  def set_lead(lead)
    @lead = lead.to_f
  end

  # Character spacing is kept in 1/USER_SPACE units.
  def set_char_spacing(line)
    @char_spacing = line.to_f * USER_SPACE
  end

  # Stores +txt+ and measures it: @boxwidth is the width without trailing
  # whitespace, @w the width including it.
  #
  # Octal escapes are resolved first (in place, on the string passed in),
  # because the width has to be computed from the real bytes and not from
  # the four characters of an escape sequence.
  def set_txt(txt)
    unescape_txt!(txt)
    @boxwidth = 0
    txt.rstrip.each_byte do |char|
      @boxwidth += char_width(char)
    end
    @w = @boxwidth
    # \z instead of $: only whitespace at the very end of the text counts.
    # $ would also match in front of an embedded newline, adding width
    # that is already part of @boxwidth.
    if white = txt[/\s+\z/u]
      white.each_byte do |char|
        @w += char_width(char)
      end
    end
    @txt = recode_txt(txt)
  end

  # Word spacing is kept in 1/USER_SPACE units.
  def set_word_spacing(word_spacing)
    @word_spacing = word_spacing.to_f * USER_SPACE
  end

  # Moves to the start of the next line.
  def step
    @dtmx = 0
    @tmy -= lead
  end

  # Moves the line start horizontally by +x_val+.
  def update_x(x_val)
    @dtmx = 0
    @tmx += x_val.to_f
  end

  # Moves the line start vertically by +y_val+.
  def update_y(y_val)
    @dtmx = 0
    @tmy -= y_val.to_f
  end

  # Replaces three-digit escapes (\ddd) in +txt+ by the byte they denote.
  # Modifies +txt+ in place; returns nil when nothing was replaced.
  #
  # Values above \377 do not fit into a byte. The high-order overflow is
  # dropped instead of letting Integer#chr raise a RangeError.
  def unescape_txt!(txt)
    txt.gsub!(/\\([0-9]{3})/n) { |match| ($1.oct & 0xFF).chr }
  end

  protected
  attr_writer :x
end
|
337
|
+
# A positioned element that is not text (rules, images). It carries a copy
# of the text state that was current when it was placed and forwards every
# message it does not understand itself to that copy.
class NontextElement < PositionedElement
  attr_accessor :current_page
  attr_reader :x, :y, :x2, :y2, :text_state

  def initialize
    @x = @matrix_x = 0.0
    @y = 0.0
    @w = 0.0
    @cmxscale = @tmxscale = 1.0
    @cmyscale = @tmyscale = 1.0
    @xscale = 1.0
    @yscale = 1.0
    @space_width = -1
    super
  end

  # A non-text element always counts as content.
  def empty?
    false
  end

  # Emits a line break when +previous+ is on another line and announces the
  # font when it differs from the one of +previous+.
  def fire_early_callbacks(previous, callback_handler)
    if(previous)
      unless(same_line(previous))
        callback_handler.send_line_break
      end
      if(@font && previous.font != @font)
        callback_handler.new_font(@font)
      end
    end
  end

  def same_column(other)
    false
  end

  # Remembers media box and font of +ts+ and keeps a copy of it as the
  # target for forwarded messages.
  def text_state=(ts)
    @media_box = ts.media_box
    @font = ts.font
    @text_state = ts.dup
  end

  # Forwards unknown messages to the stored text state (via send, so
  # non-public methods are reachable as well). Without a text state the
  # regular NoMethodError for this element is raised, instead of one that
  # blames nil.
  def method_missing(name, *args, &block)
    return super unless @text_state
    @text_state.send(name, *args, &block)
  end

  # Keeps respond_to? in line with what #method_missing can deliver.
  def respond_to_missing?(name, include_private = false)
    (@text_state && @text_state.respond_to?(name, true)) || super
  end
end
|
376
|
+
# A horizontal rule. It has no extent of its own: x2 and y2 simply answer
# x and y.
class HorizontalRule < NontextElement
  alias :x2 :x
  alias :y2 :y

  # x, y:: position handed to set_x / set_y
  # dm::   transformation matrix in effect for the rule
  # NOTE(review): set_x / set_y store offsets only; whether @x / @y are
  # derived from them later is not visible here - confirm with the caller.
  def initialize(x, y, dm)
    super()
    self.transformation_matrix = dm
    set_x(x)
    set_y(y)
  end

  # Emits the rule unless +previous+ is missing or is a rule on the same
  # line.
  def send_content(previous, callback_handler)
    callback_handler.send_hr if previous && !same_line(previous)
  end

  # Only another rule can share a line with a rule: its y has to be less
  # than 10 units away.
  def same_line(other)
    other.is_a?(HorizontalRule) && (other.y - @y).abs < 10
  end

  # Rules on the same line are equal; anything else is ordered by the
  # inherited comparison.
  def <=>(other)
    return 0 if same_line(other)
    super
  end
end
|
405
|
+
# The placement of an image on a page, either an inline image or a
# reference to a named XObject of the current page.
class ImagePlacement < NontextElement
  attr_reader :resource

  # resource:: an InlineImage, or the name of an XObject; of a name the
  #            first character is dropped and the rest is downcased and
  #            turned into a symbol
  # x, y::     position; y is shifted by half of the vertical scale of +dm+
  # dm::       transformation matrix in effect for the image
  def initialize(resource, x, y, dm)
    super()
    if InlineImage === resource
      @xobject = resource
    else
      @resource = resource.downcase[1..-1].to_sym
    end
    self.transformation_matrix = dm
    @x = x
    @y = y - @cmyscale / 2
  end

  # The image of the underlying XObject.
  def image
    xobject.image
  end

  # Images share a line only with elements at exactly the same y.
  def same_line(other)
    @y == other.y
  end

  # Hands this placement to the handler, provided an XObject is available.
  def send_content(previous, callback_handler)
    callback_handler.send_image self if xobject
  end

  # The inline image, or the XObject looked up (once) in the resources of
  # the current page.
  def xobject
    @xobject ||= @current_page.resources.xobject(@resource)
  end
end
|
434
|
+
end
|