rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # Text -- Rpdf2txt -- 28.11.2002 -- aschrafl@ywesee.com
24
+
25
+ require 'rpdf2txt/text_state'
26
+ require 'rpdf2txt/textparser'
27
+ require 'rpdf2txt/object'
28
+ require 'iconv'
29
+
30
+ module Rpdf2txt
31
# Walks the syntax tree of one PDF content stream and turns it into a list
# of positioned text snippets (copies of the TextState taken after each
# piece of text has been applied).
class Text
  attr_writer :current_page
  attr_reader :text_state, :transformation_matrix

  # src::             raw content-stream source; runs of CR/LF are collapsed
  #                   to a single "\n" before parsing.
  # target_encoding:: handed to TextState.new.
  # tm::              3x3 matrix stored as the transformation matrix and
  #                   pushed into the freshly created TextState.
  def initialize(src, target_encoding='utf8',
                 tm=Matrix[[1,0,0],[0,-1,0],[0,0,1]])
    @src = src.gsub(/[\r\n]+/n, "\n")
    @text_state = TextState.new(target_encoding)
    @transformation_matrix = tm
    @text_state.transformation_matrix = tm
  end

  # FIXME: generic_symbol_font is a workaround. Implement a way to
  # pass unicode-snippets (or rework everything to unicode)
  #
  # Returns a Symbol font: a new one when +font+ is nil, +font+ itself when
  # its basefont name already matches /symbol/i, otherwise a dup of +font+
  # whose :basefont attribute is set to 'Symbol'.
  def generic_symbol_font(font)
    if font.nil?
      Font.new('<< /BaseFont /Symbol')
    elsif /symbol/in.match(font.basefont_name)
      font
    else
      genfont = font.dup
      genfont.attributes[:basefont] = 'Symbol'
      genfont
    end
  end

  # Looks +font_name+ up (downcased, as a Symbol) on the current page.
  # Returns nil when no current page has been assigned.
  def get_font(font_name)
    return nil unless @current_page
    @current_page.font(font_name.to_s.downcase.intern)
  end

  # Maps a character code through the current font.
  #
  # Tries the font's CMap first (the mapped value must have an entry in
  # SymbolMap::SYMBOL_ENTITIES), then the font's ToUnicode map. Returns the
  # mapped character as a String, or nil when there is no current font or
  # neither map knows the code.
  def mapped_ascii(ascii)
    return nil unless @current_font
    # The entity lookup uses its own local: assigning to +ascii+ here would
    # hand nil to the ToUnicode fallback whenever SYMBOL_ENTITIES has no
    # entry for the CMap value.
    if (cmap = @current_font.cmap) && (map = cmap.map) \
        && (unicode_bytes = map[ascii]) \
        && (entity = SymbolMap::SYMBOL_ENTITIES[unicode_bytes])
      entity.chr
    elsif (to_unicode = @current_font.to_unicode) \
        && (utf8 = to_unicode.to_utf8(ascii))
      @current_font.attributes[:encoding] = '/UTF8'
      #@text_state.set_font(@current_font)
      [utf8].pack('U')
    end
  end

  # Parses the source and returns the collected snippets.
  # On any error the source is dumped for debugging and the error re-raised.
  def scan
    @snippets = []
    ast = Rpdf2txt.text_parser.parse(@src)
    scan_tree(ast)
    @snippets
  rescue Exception
    # Diagnostics go to stderr so they cannot end up in extracted text that
    # a caller writes to stdout.
    $stderr.puts @src
    raise
  end

  # Applies every child of every node in +ast+ to the text state, recursing
  # into 'values' children and emitting a snippet for each piece of text.
  def scan_tree(ast)
    ast.values.each { |node|
      if node.name == 'Array'
        first_value = node.values.first
        # Guard against an empty array, which has no first value.
        if first_value && first_value.children_names.first == 'kerning'
          # If the case [ 34 (foo) ] crops up, the first operation
          # executed on @text_state is advance_x. This results in
          # the width of the last text-snippet being calculated twice.
          # This here is a workaround that resets the snippet to an
          # empty string if we are encountering such a construct.
          # TODO: find a more general solution
          @text_state.set_txt('')
        end
      end
      node.children_names.each { |child_name|
        case child_name
        when 'alpha'
          @text_state.tmalpha = node.alpha.value.to_f
        when 'beta'
          @text_state.tmbeta = -node.beta.value.to_f
          skew = node.beta.value.to_f > 0.1
          if @current_font && @current_font.skewed != skew
            # dup so snippets already emitted keep their own font object
            @current_font = @current_font.dup
            @current_font.skewed = skew
            @text_state.set_font(@current_font)
          end
        when 'xscale'
          @text_state.set_xscale(node.xscale.value)
        when 'yscale'
          @text_state.set_yscale(node.yscale.value)
        when 'charspace'
          @text_state.set_char_spacing(node.charspace.value)
        when 'kerning'
          @text_state.advance_x(node.kerning.value.to_f)
        when 'tdleadx'
          @text_state.update_x(node.tdleadx.value.to_f)
        when 'tdleady'
          lead = node.tdleady.value.to_f
          @text_state.set_lead(lead)
          @text_state.update_y(lead)
        when 'xpos'
          @text_state.update_x(node.xpos.value.to_f)
        when 'ypos'
          @text_state.update_y(node.ypos.value.to_f)
        when 'fontname'
          @current_font = get_font(node.fontname.value)
          @text_state.set_font(@current_font)
          @text_state.set_font_size(node.fontsize.value)
        when 'tmx'
          @text_state.set_x(node.tmx.value.to_f)
        when 'tmy'
          @text_state.set_y(node.tmy.value)
        when 'render'
          val = node.render.value
          if @current_font && @current_font.rendering_mode != val
            @current_font = @current_font.dup
            @current_font.rendering_mode = val
            @text_state.set_font(@current_font)
          end
        when 'wordspace'
          @text_state.set_word_spacing(node.wordspace.value)
        when 'values'
          scan_tree(node)
        when 'snippet'
          snip(node.snippet.value)
        when 'aposnippet'
          @text_state.step
          snip(node.aposnippet.value)
        when 'linebreak'
          @text_state.step
        when 'textrise'
          #add functionality for textrise p 387 pdf manual
        when 'hexsnippet'
          hex_bytes = node.hexsnippet.value
          decoded = ''
          # Codes are taken greedily in groups of up to four hex digits.
          hex_bytes.scan(/.{2,4}/n) { |pair|
            decoded << (mapped_ascii(pair.hex) || '?')
          }
          _snip(decoded)
        end
      }
    }
  end

  # Handles a literal string operand: strips the surrounding delimiters,
  # replaces \n, \r and \t escapes with a space, unescapes parentheses,
  # maps every character through mapped_ascii and emits the result.
  def snip(snippet)
    snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
    snippet_text.gsub!(/\\([()])/n, '\1')
    snippet_text.gsub!(/./n) { |char|
      # mapped_ascii expects a numeric code (as the hex path passes).
      # String#[] only returns one on Ruby 1.8; unpack works everywhere.
      self.mapped_ascii(char.unpack('C').first) || char
    }
    _snip(snippet_text)
  end

  # Stores +snippet_text+ in the text state, recomputes its position using
  # the page's :rotate attribute (0 without a page) and records a copy.
  # Returns that copy.
  def _snip(snippet_text)
    @text_state.set_txt(snippet_text)
    @text_state.update!(@current_page ? @current_page.attributes[:rotate] : 0)
    @snippets.push(@text_state.dup).last
  end

  # Replaces the text state, first giving it this object's matrix.
  def text_state=(text_state)
    text_state.transformation_matrix = @transformation_matrix
    @text_state = text_state
  end
end
182
+ end
@@ -0,0 +1,434 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
24
+
25
+ module Rpdf2txt
26
# Common base of everything that has a position on a page. Keeps the text
# matrix (@tm*) and the current transformation matrix (@cm*) components and
# derives the final coordinates from them in update!.
#
# Subclasses are expected to provide same_line(other) and readers for x and
# y; #<=> relies on them.
class PositionedElement
  # Scales are rounded to 1/USER_SPACE.
  USER_SPACE = 1000.0
  attr_accessor :media_box, :tmalpha, :tmbeta

  # Hook; does nothing in the base class.
  def fire_callbacks(previous_positioned_element, callback_handler)
  end

  # Sets the horizontal text-matrix offset and resets the accumulated
  # horizontal displacement.
  def set_x(x)
    @tmx = @dtmx = 0
    @tmxoffset = x.to_f
  end

  def set_xscale(xscale)
    @tmxscale = round_scale(xscale.to_f)
  end

  # Sets the vertical text-matrix offset and resets the accumulated
  # vertical displacement.
  def set_y(y)
    @tmy = 0
    @tmyoffset = y.to_f
  end

  def set_yscale(yscale)
    @tmyscale = round_scale(yscale.to_f)
  end

  # Splits the matrix +tm+ into its scale, skew and offset components.
  #
  # Original authors' note (what exactly "this" refers to is not evident
  # from this file):
  #   This shouldn't happen, but we do have an example of it happening in
  #   /Producer (Hyf PDF Output Library 2.2.3 \(Windows\))
  #   /Producer (Mac OS X 10.4.6 Quartz PDFContext)
  def transformation_matrix=(tm)
    @cmxscale = round_scale(tm[0,0])
    @cmalpha = tm[0,1]
    @cmbeta = tm[1,0]
    @cmyscale = round_scale(tm[1,1])
    @cmxoffset = tm[2,0]
    @cmyoffset = tm[2,1]
  end

  def whitespace_overlap?(previous)
    false
  end

  # Recomputes @x, @y, @x2, @y2 and @right_edge from the matrix components.
  # +rotation+ is in degrees; an odd multiple of 90 swaps the axes.
  def update!(rotation=0)
    orientation = (rotation.to_f.round / 90) % 2
    if orientation == 1
      x = @tmxoffset + @tmy * @tmalpha
      y = @tmyoffset + (@tmx + @dtmx) * @tmbeta
      x2 = x + @font_size * @tmalpha
      y2 = y + @w * @tmbeta
      by = y + @boxwidth * @tmbeta
      # axes are swapped: the computed y becomes the stored x and vice versa
      @x = y + @cmxoffset
      @y = x + @cmyoffset
      @x2 = y2 + @cmxoffset
      @y2 = x2 + @cmyoffset
      @right_edge = by + @cmxoffset
    else
      x = @tmxoffset + (@tmx + @dtmx) * @tmxscale
      y = @tmyoffset - @tmy * @tmyscale
      x2 = x + @w * @tmxscale
      y2 = y - @font_size * @tmyscale
      bx = x + @boxwidth * @tmxscale
      @x = x + @cmxoffset
      @y = y + @cmyoffset
      @x2 = x2 + @cmxoffset
      @y2 = y2 + @cmyoffset
      @right_edge = bx + @cmxoffset
    end
  end

  # Orders elements: by x when on the same line, otherwise by y. For
  # elements of the same class the y order is reversed when @cmyscale is
  # negative.
  #
  # Returns -1, 0 or 1, or nil when the coordinates cannot be compared.
  # Only the sign of @cmyscale is used, so the result is never a scaled
  # Float.
  def <=>(other)
    return @x <=> other.x if same_line(other)
    order = @y <=> other.y
    if order && other.is_a?(self.class) && @cmyscale.to_f < 0
      -order
    else
      order
    end
  end

  private

  # Rounds +value+ to three decimal places (1/USER_SPACE).
  def round_scale(value)
    (value * USER_SPACE).round.to_f / USER_SPACE
  end
end
99
# Text state of a content stream: font, size, spacing, position and the
# current piece of text with its computed width. Copies of this object are
# what the text scanner collects as snippets.
class TextState < PositionedElement
  UTF = /utf/in
  attr_accessor :font, :txt
  include Comparable
  attr_reader :y, :x, :x2, :y2, :w, :boxwidth, :xscale, :font_size, :yscale,
              :right_edge

  # target_encoding:: iconv name of the output encoding; '//TRANSLIT//IGNORE'
  #                   is appended.
  def initialize(target_encoding='utf8')
    @boxwidth = 0
    @x = @tmx = @dtmx = @tmxoffset = @cmxoffset = 0.0
    @y = @tmy = @tmyoffset = @cmyoffset = 0.0
    @w = 0.0
    @tmalpha = @cmalpha = 0.0
    @tmbeta = @cmbeta = 0.0
    @tmxscale = @cmxscale = 1.0
    @tmyscale = @cmyscale = 1.0
    @lead = nil
    @font = nil
    @font_size = 1
    @char_spacing = 0
    @word_spacing = 0
    @target_encoding = target_encoding + '//TRANSLIT//IGNORE'
    self.transformation_matrix = Matrix[[1,0,0],[0,1,0],[0,0,1]]
  end

  # Moves the horizontal displacement past the current text (@w), reduced
  # by +kerning+ (given in 1/USER_SPACE units).
  def advance_x(kerning = 0)
    @dtmx += @w - kerning/USER_SPACE
  end

  # Width of one character at the current font size and spacing.
  # +char+ is a byte code, or a String whose first byte is used.
  def char_width(char)
    if char.is_a?(String)
      # String#[] returns a one-character String on Ruby >= 1.9, which
      # would defeat both the font lookup and the comparison with 32 below.
      char = char.unpack('C').first
    end
    w = 0.0
    if @font && (width = @font.width(char))
      w = width
    elsif @font && (avg = @font.attributes[:avgwidth])
      w = avg
    end
    w = 300.0 if w == 0   # fallback when the font provides no usable width
    w += @char_spacing
    w += @word_spacing if char == 32
    w * @font_size / USER_SPACE
  end

  # True when this state holds only whitespace, sits on the same line as
  # +previous+ and +previous+ reaches at least to its horizontal middle.
  def whitespace_overlap?(previous)
    previous && empty? && same_line(previous) \
      && previous.x2 >= (@x + (@x2 - @x) / 2)
  end

  # Reports font / font-size changes relative to +previous+ and then the
  # text itself to +callback_handler+.
  def send_content(previous, callback_handler)
    if previous
      if previous.font != @font
        callback_handler.new_font(@font)
      end
      if previous.font_height != self.font_height
        callback_handler.new_fontsize(self.font_height)
      end
    else
      callback_handler.new_font(@font)
    end
    callback_handler.send_flowing_data(@txt)
  end

  # Width of a space character (code 32) at the current font size; word
  # spacing is not included.
  def space_width
    w = 300.0
    if @font && (width = @font.width(32))
      w = width
    elsif @font && (avg = @font.attributes[:avgwidth])
      w = avg
    end
    w += @char_spacing
    w * @font_size / USER_SPACE
  end

  # Emits the line break, inter-word spaces and paragraph break that
  # separate this state from +previous+.
  def fire_early_callbacks(previous, callback_handler)
    if previous
      if !same_line(previous)
        callback_handler.send_line_break
      elsif !same_word(previous)
        if (spaces = previous.count_spaces(@x - previous.x2))
          callback_handler.send_flowing_data(' '*spaces.abs)
        end
      end
      if new_paragraph(previous)
        callback_handler.send_paragraph
      end
    end
  end

  # Number of lines covered by +displacement+; 1 if that cannot be computed.
  def count_lines(displacement)
    (displacement / lead).abs.ceil
  rescue
    1
  end

  # Number of spaces that fit into +displacement+, or nil when it is not
  # wider than a single space.
  def count_spaces(displacement)
    x = space_width * @tmxscale + @font_size * @tmalpha
    y = @font_size * @tmyscale + space_width * @tmbeta
    width = x * @cmxscale + y * @cmalpha
    if width.nonzero? && displacement > width
      (displacement / width).round
    end
  rescue ZeroDivisionError
    warn "Ignoring Division by Zero: #{displacement.inspect}/#{width.inspect}"
  end

  def empty?
    @txt.nil? || @txt.strip.empty?
  end

  # Explicit leading if one was set, otherwise -1.2 times the font height.
  def lead
    @lead || -font_height * 1.2
  end

  def font_height
    @font_size
  end

  # True when the vertical distance to +last_text_state+ exceeds 1.5 times
  # its font height (1.5 is an approximate value).
  def new_paragraph(last_text_state)
    return false if last_text_state.font_size.nil?
    spacing = last_text_state.font_height * 1.5
    last_y = last_text_state.y
    (last_y - @y).abs > spacing.abs
  end

  # Converts +txt+ from the font's encoding to the target encoding.
  # Returns +txt+ as it stands when there is no font or the conversion
  # fails.
  def recode_txt(txt)
    return txt if @font.nil?
    enc = @font.encoding
    if enc.is_a?(Encoding)
      # it would certainly be nice to do without all this iconving,
      # but since CMaps always contain utf8, and using utf16 in
      # Symbol.from_* is so much more practical than dealing with
      # variable-length utf8 encoding for the characters in the
      # Symbol font, we'll leave it at dtsttcpw for the moment.
      if @font.symbol?
        txt = enc.convert_symbol(txt)
        if UTF.match(@target_encoding)
          @utf16_iconv ||= Iconv.new(@target_encoding.to_s, 'utf16be')
          txt = @utf16_iconv.iconv(Symbol.to_utf16(txt))
        end
      elsif (tu = @font.to_unicode)
        txt = tu.to_utf8(txt)
        if UTF.match(@target_encoding)
          @utf8_iconv ||= Iconv.new(@target_encoding.to_s, 'utf8')
          txt = @utf8_iconv.iconv(txt)
        else
          @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
          txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
        end
      end
      txt
    # FIXME: fix how encodings and Symbol font are handled
    elsif UTF.match(enc) && !UTF.match(@target_encoding) && @font.symbol?
      @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
      txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
    else
      @iconv ||= Iconv.new(@target_encoding.to_s, enc.to_s)
      @iconv.iconv(txt)
    end
  rescue NoMethodError, Iconv::InvalidEncoding, Iconv::IllegalSequence
    # best effort: hand back the text unconverted
    txt
  end

  # True when on the same line as +other+ and less than two of its space
  # widths to the right of its right edge.
  def same_column(other)
    return false unless same_line(other)
    if other.is_a?(TextState)
      testwidth = other.space_width * 2.0
      width = @x - other.right_edge
      width < testwidth
    else
      false
    end
  end

  # True when the vertical extents of both states overlap by more than 40%
  # of the height of at least one of them.
  def same_line(other)
    if other.is_a?(TextState)
      sy1, sy2 = [@y, self.y2].sort
      oy1, oy2 = [other.y, other.y2].sort

      pair = [[sy1, sy2], [oy1, oy2]].sort
      overlap = pair[0][1] - pair[1][0]

      [sy2 - sy1, oy2 - oy1].any? { |height|
        # negative overlap means the lines don't touch
        overlap / height > 0.4
      }
    else
      false
    end
  end

  # True when on the same line as +other+ and closer than half of its
  # space width to its end.
  def same_word(other)
    return false unless same_line(other)
    if other.is_a?(TextState)
      testwidth = other.space_width / 2.0
      width = @x - other.x2
      width < testwidth
    else
      false
    end
  end

  # Sets the font and drops the cached converter, which is specific to the
  # previous font's encoding.
  def set_font(font)
    @iconv = nil
    @font = font
  end

  def set_font_size(size)
    @font_size = size.to_f
  end

  def set_lead(lead)
    @lead = lead.to_f
  end

  # Stored scaled by USER_SPACE, as is word spacing.
  def set_char_spacing(line)
    @char_spacing = line.to_f * USER_SPACE
  end

  # Stores +txt+ and computes its width: @boxwidth without trailing
  # whitespace, @w including it. The caller's string is left untouched.
  def set_txt(txt)
    # Work on a copy: unescape_txt! edits in place, which would otherwise
    # modify the caller's string and fail on a frozen one.
    txt = txt.dup
    # Octal escapes have to be resolved first, so that \334 is replaced by
    # the character Ü - otherwise the calculated string width is wrong.
    unescape_txt!(txt)
    @boxwidth = 0
    txt.rstrip.each_byte do |char|
      @boxwidth += char_width(char)
    end
    @w = @boxwidth
    # \z rather than $: only whitespace at the very end of the string was
    # left out of @boxwidth by rstrip. $ would also match before an interior
    # newline and count that whitespace a second time.
    if (white = txt[/\s+\z/u])
      white.each_byte do |char|
        @w += char_width(char)
      end
    end
    @txt = recode_txt(txt)
  end

  def set_word_spacing(word_spacing)
    @word_spacing = word_spacing.to_f * USER_SPACE
  end

  # Moves to the start of the next line.
  def step
    @dtmx = 0
    @tmy -= lead
  end

  def update_x(x_val)
    @dtmx = 0
    @tmx += x_val.to_f
  end

  def update_y(y_val)
    @dtmx = 0
    @tmy -= y_val.to_f
  end

  # Replaces three-digit \ddd escapes in +txt+ (in place) with the byte
  # they denote. Returns nil when nothing was replaced (String#gsub!).
  def unescape_txt!(txt)
    # & 0xFF: values above \377 do not fit into a byte and would make
    # Integer#chr raise; the high-order overflow is dropped instead.
    txt.gsub!(/\\([0-9]{3})/n) { |match| ($1.oct & 0xFF).chr }
  end

  protected
  attr_writer :x
end
337
# Base class for positioned elements that are not text. It keeps a copy of
# the text state that was current when the element was placed and forwards
# every method it does not implement itself to that copy.
class NontextElement < PositionedElement
  attr_accessor :current_page
  attr_reader :x, :y, :x2, :y2, :text_state

  def initialize
    @x = @matrix_x = 0.0
    @y = 0.0
    @w = 0.0
    @cmxscale = @tmxscale = 1.0
    @cmyscale = @tmyscale = 1.0
    @xscale = 1.0
    @yscale = 1.0
    @space_width = -1
    super
  end

  def empty?
    false
  end

  # Emits a line break when +previous+ is on another line and reports a
  # font change relative to +previous+.
  def fire_early_callbacks(previous, callback_handler)
    if previous
      unless same_line(previous)
        callback_handler.send_line_break
      end
      if @font && previous.font != @font
        callback_handler.new_font(@font)
      end
    end
  end

  def same_column(other)
    false
  end

  # Remembers a copy of +ts+ together with its media box and font.
  def text_state=(ts)
    @media_box = ts.media_box
    @font = ts.font
    @text_state = ts.dup
  end

  # Forwards unknown methods to the remembered text state.
  # Raises NoMethodError naming this element and the missing method when no
  # text state has been assigned yet. Without the guard the error would
  # complain about nil and hide the real call site.
  def method_missing(name, *args, &block)
    return super if @text_state.nil?
    @text_state.send(name, *args, &block)
  end
end
376
# A horizontal rule placed on the page. It has no extent of its own: x2 and
# y2 are the same as x and y.
class HorizontalRule < NontextElement
  alias :x2 :x
  alias :y2 :y

  # x, y:: position, stored through set_x / set_y
  # dm::   matrix stored as the transformation matrix
  #
  # NOTE(review): set_x / set_y only record the text-matrix offsets; @x and
  # @y keep the 0.0 assigned by NontextElement#initialize until update! runs.
  # Confirm that callers run update! before rules are compared.
  def initialize(x, y, dm)
    super()
    self.transformation_matrix = dm
    set_x(x)
    set_y(y)
  end

  # Sends a rule unless +previous+ is missing or is a rule on the same line.
  def send_content(previous, callback_handler)
    callback_handler.send_hr if previous && !same_line(previous)
  end

  # Only another rule can be on the same line, and only when it is less
  # than 10 units away vertically.
  def same_line(other)
    other.is_a?(HorizontalRule) && (other.y - @y).abs < 10
  end

  # Rules on the same line are considered equal; everything else is ordered
  # by the inherited comparison.
  def <=> other
    return 0 if other.is_a?(HorizontalRule) && same_line(other)
    super
  end
end
405
# Placement of an image on the page. The image is either given directly as
# an InlineImage or referenced by resource name and looked up on the
# current page when first needed.
class ImagePlacement < NontextElement
  attr_reader :resource

  # resource:: an InlineImage, or a resource name whose first character is
  #            dropped and whose remainder is downcased and symbolized
  # x, y::     position; y is shifted by half the vertical scale of +dm+
  # dm::       matrix stored as the transformation matrix
  def initialize(resource, x, y, dm)
    super()
    if InlineImage === resource
      @xobject = resource
    else
      @resource = resource.downcase[1..-1].to_sym
    end
    self.transformation_matrix = dm
    @x = x
    @y = y - @cmyscale / 2
  end

  def image
    xobject.image
  end

  # Images share a line only with elements at exactly the same y.
  def same_line(other)
    @y == other.y
  end

  # Hands this placement to the handler if its image object can be resolved.
  def send_content(previous, callback_handler)
    callback_handler.send_image(self) if xobject
  end

  # The image object; looked up among the current page's resources on first
  # use and cached.
  def xobject
    @xobject = @current_page.resources.xobject(@resource) unless @xobject
    @xobject
  end
end
434
+ end