rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # Text -- Rpdf2txt -- 28.11.2002 -- aschrafl@ywesee.com
24
+
25
+ require 'rpdf2txt/text_state'
26
+ require 'rpdf2txt/textparser'
27
+ require 'rpdf2txt/object'
28
+ require 'iconv'
29
+
30
+ module Rpdf2txt
31
# Turns the source of one PDF text section into a list of positioned text
# snippets. The source is parsed with Rpdf2txt.text_parser; every operator
# found in the resulting tree is applied to a TextState, and every string
# that is shown appends a copy of that state to the list returned by #scan.
class Text
  attr_writer :current_page
  attr_reader :text_state, :transformation_matrix

  # src::             source of the text section; every run of CR/LF
  #                   characters is collapsed into a single "\n"
  # target_encoding:: passed on to TextState.new
  # tm::              transformation matrix; stored and handed to the
  #                   text state
  def initialize(src, target_encoding='utf8',
                 tm=Matrix[[1,0,0],[0,-1,0],[0,0,1]])
    @src = src.gsub(/[\r\n]+/n, "\n")
    @text_state = TextState.new(target_encoding)
    @transformation_matrix = tm
    @text_state.transformation_matrix = tm
  end

  ## FIXME: generic_symbol_font is a workaround. Implement a way to
  ##        pass unicode-snippets (or rework everything to unicode)
  #
  # Returns a font whose base font is Symbol:
  # * a fresh Font when +font+ is nil,
  # * +font+ itself when its basefont_name already matches /symbol/i,
  # * otherwise a duplicate of +font+ with attributes[:basefont] set
  #   to 'Symbol'.
  def generic_symbol_font(font)
    if(font.nil?)
      Font.new('<< /BaseFont /Symbol')
    elsif(/symbol/in.match(font.basefont_name))
      font
    else
      # NOTE(review): unless Font overrides #dup, the duplicate shares
      # its attributes hash with +font+, so the assignment below would
      # also change the original -- confirm against Font.
      genfont = font.dup
      genfont.attributes[:basefont] = 'Symbol'
      genfont
    end
  end

  # Looks up +font_name+ (downcased, as a Symbol) on the current page.
  # Returns nil when no current page has been assigned.
  def get_font(font_name)
    return nil unless @current_page
    @current_page.font(font_name.to_s.downcase.intern)
  end

  # Translates one character code through the current font.
  #
  # Returns
  # * the one-character string for the SymbolMap::SYMBOL_ENTITIES entry
  #   when the font's cmap maps +ascii+ to a known entity,
  # * else the result of packing the font's to_unicode translation of
  #   +ascii+ with 'U'; as a side effect the font's :encoding
  #   attribute is set to '/UTF8',
  # * nil when there is no current font or neither lookup succeeds.
  def mapped_ascii(ascii)
    if(@current_font)
      if((cmap = @current_font.cmap) && (map = cmap.map) \
         && (unicode_bytes = map[ascii]) \
         && (ascii = SymbolMap::SYMBOL_ENTITIES[unicode_bytes]))
        ascii.chr
      elsif((map = @current_font.to_unicode) \
            && (utf8 = map.to_utf8(ascii)))
        @current_font.attributes[:encoding] = '/UTF8'
        #@text_state.set_font(@current_font)
        [utf8].pack('U')
      end
    end
  end

  # Parses the source and returns the collected snippets (an Array of
  # TextState copies). Any exception is re-raised after the offending
  # source has been written out for diagnosis.
  def scan
    @snippets = []
    ast = Rpdf2txt.text_parser.parse(@src)
    scan_tree(ast)
    @snippets
  rescue Exception
    # Diagnostics go to STDERR so that a failure cannot leak the raw
    # content stream into whatever is being written to STDOUT.
    $stderr.puts @src
    raise
  end

  # Applies every child of every node in +ast+.values to the text state;
  # nested 'values' children are walked recursively.
  def scan_tree(ast)
    ast.values.each { |node|
      if(node.name == 'Array')
        ## If the case [ 34 (foo) ] crops up, the first operation
        ## executed on @text_state is advance_x. This results in
        ## the width of the last text-snippet being calculated twice.
        ## This here is a workaround that resets the snippet to an
        ## empty string if we are encountering a [ ??? ] construct
        ## (an array).
        ## TODO: find a more general solution
        first = node.values.first
        # +first+ is nil for an array without elements; nothing has to
        # be reset in that case.
        if(first && first.children_names.first == 'kerning')
          @text_state.set_txt('')
        end
      end
      node.children_names.each { |child_name|
        case child_name
        when 'alpha'
          @text_state.tmalpha = node.alpha.value.to_f
        when 'beta'
          @text_state.tmbeta = -node.beta.value.to_f
          # a beta above 0.1 marks the font as skewed; the font is
          # duplicated so the page's font object is left untouched
          skew = node.beta.value.to_f > 0.1
          if(@current_font && @current_font.skewed != skew)
            @current_font = @current_font.dup
            @current_font.skewed = skew
            @text_state.set_font(@current_font)
          end
        when 'xscale'
          @text_state.set_xscale(node.xscale.value)
        when 'yscale'
          @text_state.set_yscale(node.yscale.value)
        when 'charspace'
          @text_state.set_char_spacing(node.charspace.value)
        when 'kerning'
          @text_state.advance_x(node.kerning.value.to_f)
        when 'tdleadx'
          @text_state.update_x(node.tdleadx.value.to_f)
        when 'tdleady'
          # sets the leading and moves by the same amount
          lead = node.tdleady.value.to_f
          @text_state.set_lead(lead)
          @text_state.update_y(lead)
        when 'xpos'
          @text_state.update_x(node.xpos.value.to_f)
        when 'ypos'
          @text_state.update_y(node.ypos.value.to_f)
        when 'fontname'
          @current_font = get_font(node.fontname.value)
          @text_state.set_font(@current_font)
          @text_state.set_font_size(node.fontsize.value)
        when 'tmx'
          @text_state.set_x(node.tmx.value.to_f)
        when 'tmy'
          @text_state.set_y(node.tmy.value)
        when 'render'
          val = node.render.value
          if(@current_font && @current_font.rendering_mode != val)
            @current_font = @current_font.dup
            @current_font.rendering_mode = val
            @text_state.set_font(@current_font)
          end
        when 'wordspace'
          @text_state.set_word_spacing(node.wordspace.value)
        when 'values'
          scan_tree(node)
        when 'snippet'
          snip(node.snippet.value)
        when 'aposnippet'
          # move to the next line first, then show the string
          @text_state.step
          snip(node.aposnippet.value)
        when 'linebreak'
          @text_state.step
        when 'textrise'
          #add functionality for textrise p 387 pdf manual
        when 'hexsnippet'
          hex_bytes = node.hexsnippet.value
          char = ''
          # groups of two to four characters are read as one hex number;
          # codes without a mapping are shown as '?'
          hex_bytes.scan(/.{2,4}/n) { |pair|
            dec_byte = pair.hex
            char << (mapped_ascii(dec_byte) || '?')
          }
          _snip(char)
        end
      }
    }
  end

  # Prepares a literal string and records it as a snippet.
  # The first and last character of +snippet+ are dropped (presumably the
  # enclosing parentheses -- TODO confirm against the grammar), the
  # escapes \n, \r and \t become a blank, \( and \) are unescaped and
  # every character is passed through #mapped_ascii.
  def snip(snippet)
    snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
    snippet_text.gsub!(/\\([()])/n, '\1')
    snippet_text.gsub!(/./n) { |char|
      # NOTE(review): char[0] is a byte value only on Ruby 1.8; on 1.9
      # and later it is a one-character String -- confirm the Ruby
      # version this is meant to run on.
      self.mapped_ascii(char[0]) || char
    }
    _snip(snippet_text)
  end

  # Stores +snippet_text+ in the text state, recomputes its position
  # (using the current page's :rotate attribute, or 0 without a page)
  # and appends a copy of the state to the snippet list.
  # Returns that copy.
  def _snip(snippet_text)
    @text_state.set_txt(snippet_text)
    @text_state.update!(@current_page ? @current_page.attributes[:rotate] : 0)
    # the list normally comes from #scan; create it on demand so that
    # #snip and #scan_tree can also be used on their own
    @snippets ||= []
    @snippets.push(@text_state.dup).last
  end

  # Replaces the text state; the new state receives this object's
  # transformation matrix before it is stored.
  def text_state=(text_state)
    text_state.transformation_matrix = @transformation_matrix
    @text_state = text_state
  end
end
182
+ end
@@ -0,0 +1,434 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
24
+
25
+ module Rpdf2txt
26
# Common base of everything that has a position on a page. It keeps the
# text-matrix values (@tm*) and the values taken from the transformation
# matrix (@cm*) and derives the final coordinates from them in #update!.
class PositionedElement
  # Scale values are rounded to 1/USER_SPACE.
  USER_SPACE = 1000.0

  attr_accessor :media_box, :tmalpha, :tmbeta

  # Hook without behaviour in this class.
  def fire_callbacks(previous_positioned_element, callback_handler)
  end

  # Restarts horizontal positioning at offset +x+.
  def set_x(x)
    @dtmx = 0
    @tmx = 0
    @tmxoffset = x.to_f
  end

  # Stores the horizontal text scale, rounded to three decimals.
  def set_xscale(xscale)
    @tmxscale = snap_to_user_space(xscale.to_f)
  end

  # Restarts vertical positioning at offset +y+.
  def set_y(y)
    @tmy = 0
    @tmyoffset = y.to_f
  end

  # Stores the vertical text scale, rounded to three decimals.
  def set_yscale(yscale)
    @tmyscale = snap_to_user_space(yscale.to_f)
  end

  # Takes scale, alpha/beta and offsets from +tm+, which is read with
  # two-argument indexing (tm[row, column]).
  def transformation_matrix=(tm)
    ### This shouldn't happen, but we do have an example of
    ### it happening in
    ### /Producer (Hyf PDF Output Library 2.2.3 \(Windows\))
    ### /Producer (Mac OS X 10.4.6 Quartz PDFContext)
    @cmxscale = snap_to_user_space(tm[0,0])
    @cmalpha  = tm[0,1]
    @cmbeta   = tm[1,0]
    @cmyscale = snap_to_user_space(tm[1,1])
    @cmxoffset = tm[2,0]
    @cmyoffset = tm[2,1]
  end

  # Elements of this class never overlap preceding whitespace.
  def whitespace_overlap?(previous)
    false
  end

  # Recomputes @x, @y, @x2, @y2 and @right_edge.
  # For an odd number of quarter turns in +rotation+ the roles of the
  # two axes are swapped and alpha/beta are used in place of the scales.
  def update!(rotation=0)
    quarter_turns = rotation.to_f.round / 90
    if quarter_turns % 2 == 1
      vert  = @tmxoffset + @tmy * @tmalpha
      horiz = @tmyoffset + (@tmx + @dtmx) * @tmbeta
      vert_end  = vert + @font_size * @tmalpha
      horiz_end = horiz + @w * @tmbeta
      box_edge  = horiz + @boxwidth * @tmbeta
    else
      horiz = @tmxoffset + (@tmx + @dtmx) * @tmxscale
      vert  = @tmyoffset - @tmy * @tmyscale
      horiz_end = horiz + @w * @tmxscale
      vert_end  = vert - @font_size * @tmyscale
      box_edge  = horiz + @boxwidth * @tmxscale
    end
    # the offsets of the transformation matrix apply in both cases
    @x  = horiz + @cmxoffset
    @y  = vert + @cmyoffset
    @x2 = horiz_end + @cmxoffset
    @y2 = vert_end + @cmyoffset
    @right_edge = box_edge + @cmxoffset
  end

  # Orders elements on the same line by x; otherwise by y.
  def <=>(other)
    return @x <=> other.x if same_line(other)
    vertical = @y <=> other.y
    return vertical unless other.is_a?(self.class)
    # @cmyscale may be negative, reversing the sort-order
    factor = (@cmyscale == 0) ? 1 : @cmyscale
    vertical * factor
  end

  private

  # Rounds +value+ to the nearest 1/USER_SPACE.
  def snap_to_user_space(value)
    (value * USER_SPACE).round.to_f / USER_SPACE
  end
end
99
# Text-positioning state: current font, font size, spacing, text-matrix
# values and the text (with its computed width) of the latest snippet.
# Instances are Comparable through PositionedElement#<=>.
class TextState < PositionedElement
  UTF = /utf/in
  attr_accessor :font, :txt
  include Comparable
  attr_reader :y, :x, :x2, :y2, :w, :boxwidth, :xscale, :font_size, :yscale,
    :right_edge

  # target_encoding:: encoding name; '//TRANSLIT//IGNORE' is appended
  #                   before it is handed to Iconv
  def initialize(target_encoding='utf8')
    @boxwidth = 0
    @x = @tmx = @dtmx = @tmxoffset = @cmxoffset = 0.0
    @y = @tmy = @tmyoffset = @cmyoffset = 0.0
    @w = 0.0
    @tmalpha = @cmalpha = 0.0
    @tmbeta = @cmbeta = 0.0
    @tmxscale = @cmxscale = 1.0
    @tmyscale = @cmyscale = 1.0
    @lead = nil
    @font = nil
    @font_size = 1
    @char_spacing = 0
    @word_spacing = 0
    @target_encoding = target_encoding + '//TRANSLIT//IGNORE'
    self.transformation_matrix = Matrix[[1,0,0],[0,1,0],[0,0,1]]
  end

  # Advances the horizontal displacement by the width of the current
  # text minus +kerning+ (given in 1/USER_SPACE units).
  def advance_x(kerning = 0)
    @dtmx += @w - kerning/USER_SPACE
  end

  # Width of one character (a byte value, or a String whose first
  # element is used) at the current font size.
  # Falls back to the font's :avgwidth attribute and finally to 300
  # when no width is known; character spacing is always added and word
  # spacing is added for code 32 (space).
  def char_width(char)
    if(char.is_a? String)
      char = char[0]
    end
    w = 0.0
    if(@font && (width = @font.width(char)))
      w = width
    elsif(@font && (avg = @font.attributes[:avgwidth]))
      w = avg
    end
    w = 300.0 if w == 0
    w += @char_spacing
    if(char==32)
      w += @word_spacing
    end
    w * @font_size / USER_SPACE
  end

  # True when this state holds only whitespace, lies on the same line as
  # +previous+ and +previous+ reaches at least to its horizontal middle.
  def whitespace_overlap?(previous)
    previous && empty? && same_line(previous) \
      && previous.x2 >= (@x + (@x2 - @x) / 2)
  end

  # Announces font and font-size changes relative to +previous+ (the
  # font is always announced when there is no previous element) and
  # then sends the text to +callback_handler+.
  def send_content(previous, callback_handler)
    if(previous)
      if(previous.font != @font)
        callback_handler.new_font(@font)
      end
      if(previous.font_height != self.font_height)
        callback_handler.new_fontsize(self.font_height)
      end
    else
      callback_handler.new_font(@font)
    end
    callback_handler.send_flowing_data(@txt)
  end

  # Width of a space at the current font size: the font's width for
  # code 32, else its :avgwidth attribute, else 300; character spacing
  # is added, word spacing is not.
  def space_width
    w = 300.0
    if(@font && (width = @font.width(32)))
      w = width
    elsif(@font && (avg = @font.attributes[:avgwidth]))
      w = avg
    end
    w += @char_spacing
    w * @font_size / USER_SPACE
  end

  # Sends a line break when +previous+ is on another line, or the
  # blanks needed to bridge the gap when it is on the same line but not
  # part of the same word; finally sends a paragraph when the vertical
  # distance is large enough (see #new_paragraph).
  def fire_early_callbacks(previous, callback_handler)
    if(previous)
      if(!same_line(previous))
        callback_handler.send_line_break
      elsif(!same_word(previous))
        if(spaces = previous.count_spaces(@x - previous.x2))
          callback_handler.send_flowing_data(' '*spaces.abs)
        end
      end
      if(new_paragraph(previous))
        callback_handler.send_paragraph
      end
    end
  end

  # Number of lines covered by +displacement+, rounded up.
  # Returns 1 when the computation raises a StandardError.
  def count_lines(displacement)
    (displacement / lead).abs.ceil
  rescue
    1
  end

  # Number of space widths (rounded) that fit into +displacement+.
  # Returns nil when the computed width is zero or +displacement+ does
  # not exceed it.
  def count_spaces(displacement)
    x = space_width * @tmxscale + @font_size * @tmalpha
    y = @font_size * @tmyscale + space_width * @tmbeta
    width = x * @cmxscale + y * @cmalpha
    if(width.nonzero? && displacement > width)
      (displacement / width).round
    end
  rescue ZeroDivisionError
    warn "Ignoring Division by Zero: #{displacement.inspect}/#{width.inspect}"
  end

  # True when there is no text or only whitespace.
  def empty?
    @txt.nil? || @txt.strip.empty?
  end

  # The leading that was set, or -1.2 times the font height.
  def lead
    @lead || -font_height * 1.2
  end

  def font_height
    @font_size
  end

  # True when the vertical distance to +last_text_state+ exceeds 1.5
  # times its font height. False when it has no font size.
  def new_paragraph(last_text_state)
    return false if(last_text_state.font_size.nil?)
    #1.5 is an approximate value
    spacing = last_text_state.font_height * 1.5
    last_y = last_text_state.y
    ((last_y - @y).abs > spacing.abs)
  end

  # Converts +txt+ from the font's encoding to the target encoding.
  # Returns +txt+ unchanged when the conversion fails with one of the
  # rescued errors -- this includes the NoMethodError raised when no
  # font is set.
  def recode_txt(txt)
    enc = @font.encoding
    # NOTE(review): Encoding is presumably the project's own class and
    # not Ruby's built-in ::Encoding -- confirm.
    if(enc.is_a?(Encoding))
      # it would certainly be nice to do without all this iconving,
      # but since CMaps always contain utf8, and using utf16 in
      # Symbol.from_* is so much more practical than dealing with
      # variable-length utf8 encoding for the characters in the
      # Symbol font, we'll leave it at dtsttcpw for the moment.
      if(@font.symbol?)
        txt = enc.convert_symbol(txt)
        if(UTF.match(@target_encoding))
          @utf16_iconv ||= Iconv.new(@target_encoding.to_s, 'utf16be')
          txt = @utf16_iconv.iconv(Symbol.to_utf16(txt))
        end
      elsif(tu = @font.to_unicode)
        txt = tu.to_utf8(txt)
        if(UTF.match(@target_encoding))
          @utf8_iconv ||= Iconv.new(@target_encoding.to_s, 'utf8')
          txt = @utf8_iconv.iconv(txt)
        else
          @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
          txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
        end
      end
      txt
    # FIXME: fix how encodings and Symbol font are handled
    elsif(UTF.match(enc) && !UTF.match(@target_encoding) && @font.symbol?)
      @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
      txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
    else
      # @iconv depends on the font's encoding and is reset in #set_font
      @iconv ||= Iconv.new(@target_encoding.to_s, enc.to_s)
      @iconv.iconv(txt)
    end
  rescue NoMethodError, Iconv::InvalidEncoding, Iconv::IllegalSequence
    txt
  end

  # True when +other+ is a TextState on the same line whose right edge
  # is less than two of its space widths to the left of this element.
  def same_column(other)
    return false unless same_line(other)
    if(other.is_a?(TextState))
      testwidth = other.space_width * 2.0
      width = @x - other.right_edge
      width < testwidth
    else
      false
    end
  end

  # True when +other+ is a TextState and the vertical extents of both
  # overlap by more than 40% of the height of at least one of them.
  def same_line(other)
    if(other.is_a?(TextState))
      sy1, sy2 = [@y, self.y2].sort
      oy1, oy2 = [other.y, other.y2].sort

      # order the two extents by their lower bound
      pair = [[sy1, sy2], [oy1, oy2]].sort
      overlap = pair[0][1] - pair[1][0]

      [sy2 - sy1, oy2 - oy1].any? { |height|
        # negative overlap means the lines don't touch
        overlap / height > 0.4
      }
    else
      false
    end
  end

  # True when +other+ is a TextState on the same line that ends less
  # than half of its space width to the left of this element.
  def same_word(other)
    return false unless same_line(other)
    if(other.is_a?(TextState))
      testwidth = other.space_width / 2.0
      width = @x - other.x2
      width < testwidth
    else
      false
    end
  end

  # Sets the font and drops the cached converter, which was created
  # for the previous font's encoding.
  def set_font(font)
    @iconv = nil
    @font = font
  end

  def set_font_size(size)
    @font_size = size.to_f
  end

  def set_lead(lead)
    @lead = lead.to_f
  end

  # Stores the character spacing, multiplied by USER_SPACE.
  def set_char_spacing(line)
    @char_spacing = line.to_f * USER_SPACE
  end

  # Stores +txt+ as the current text.
  # +txt+ is unescaped in place, @boxwidth becomes the width of the text
  # without trailing whitespace, @w the width including it, and @txt the
  # recoded text.
  def set_txt(txt)
    #call the unescape_txt method,
    #so that \334 is replaced by char Ü
    #otherwise the calculation of the string width is wrong!!!!
    unescape_txt!(txt)
    @boxwidth = 0
    txt.rstrip.each_byte do |char|
      @boxwidth += char_width(char)
    end
    @w = @boxwidth
    if white = txt[/\s+$/u]
      white.each_byte do |char|
        @w += char_width(char)
      end
    end
    @txt = recode_txt(txt)
  end

  # Stores the word spacing, multiplied by USER_SPACE.
  def set_word_spacing(word_spacing)
    @word_spacing = word_spacing.to_f * USER_SPACE
  end

  # Moves to the start of the next line, using #lead.
  def step
    @dtmx = 0
    @tmy -= lead
  end

  def update_x(x_val)
    @dtmx = 0
    @tmx += x_val.to_f
  end

  def update_y(y_val)
    @dtmx = 0
    @tmy -= y_val.to_f
  end

  # Replaces three-digit \ddd escapes in +txt+ (in place) by the byte
  # they denote. Returns +txt+, or nil when nothing was replaced.
  #
  # Values above \377 are reduced to their low eight bits (the PDF
  # reference says high-order overflow is ignored); without the mask
  # Integer#chr raises a RangeError for them.
  # NOTE(review): escapes with one or two digits are not handled here,
  # and the digits 8 and 9 are accepted although they are not octal.
  def unescape_txt!(txt)
    txt.gsub!(/\\([0-9]{3})/n) { |match| ($1.oct & 0xFF).chr }
  end

  protected
  attr_writer :x
end
337
# Base class of positioned elements that carry no text of their own.
# Messages this class does not understand are passed on to a copy of the
# text state that was assigned through #text_state=.
class NontextElement < PositionedElement
  attr_accessor :current_page
  attr_reader :x, :y, :x2, :y2, :text_state

  def initialize
    @x = @matrix_x = 0.0
    @y = 0.0
    @w = 0.0
    @xscale = 1.0
    @yscale = 1.0
    @cmxscale = @tmxscale = 1.0
    @cmyscale = @tmyscale = 1.0
    @space_width = -1
    super
  end

  # Non-text elements always count as having content.
  def empty?
    false
  end

  # Sends a line break when +previous+ is on another line and announces
  # the font when it differs from the one of +previous+.
  # Does nothing without a previous element.
  def fire_early_callbacks(previous, callback_handler)
    return unless previous
    callback_handler.send_line_break unless same_line(previous)
    if(@font && previous.font != @font)
      callback_handler.new_font(@font)
    end
  end

  # Non-text elements never share a column with another element.
  def same_column(other)
    false
  end

  # Takes media box and font from +ts+ and keeps a duplicate of it as
  # the target for delegated calls.
  def text_state=(ts)
    @media_box = ts.media_box
    @font = ts.font
    @text_state = ts.dup
  end

  # Forwards every unknown message to the stored text state.
  def method_missing(name, *args, &block)
    delegate = @text_state
    delegate.send(name, *args, &block)
  end
end
376
# A horizontal rule. It has no extent, so x2 and y2 are aliases of the
# x and y readers.
class HorizontalRule < NontextElement
  alias_method :x2, :x
  alias_method :y2, :y

  # x, y:: handed to #set_x and #set_y
  # dm::   transformation matrix assigned before the position is set
  def initialize(x, y, dm)
    super()
    self.transformation_matrix = dm
    set_x(x)
    set_y(y)
  end

  # Sends a horizontal rule unless there is no previous element or the
  # previous element is a rule on the same line.
  def send_content(previous, callback_handler)
    callback_handler.send_hr if(previous && !same_line(previous))
  end

  # Only another HorizontalRule less than 10 units away vertically is
  # on the same line.
  def same_line(other)
    return false unless other.is_a?(HorizontalRule)
    distance = (other.y - @y).abs
    distance < 10
  end

  # Rules on the same line compare as equal; everything else is left
  # to the inherited comparison.
  def <=>(other)
    return 0 if(other.is_a?(HorizontalRule) && same_line(other))
    super
  end
end
405
# Placement of an image on a page. The image is either an InlineImage
# handed in directly or an XObject that is looked up by name in the
# resources of the current page.
class ImagePlacement < NontextElement
  attr_reader :resource

  # resource:: an InlineImage, or a name; of a name the first character
  #            is dropped and the downcased rest is kept as a Symbol
  # x, y::     position; half of the vertical scale taken from +dm+ is
  #            subtracted from +y+
  # dm::       transformation matrix
  def initialize(resource, x, y, dm)
    super()
    if InlineImage === resource
      @xobject = resource
    else
      name = resource.downcase
      @resource = name[1..-1].to_sym
    end
    # must precede the computation of @y, which reads @cmyscale
    self.transformation_matrix = dm
    @x = x
    @y = y - @cmyscale / 2
  end

  # The image of the underlying XObject.
  def image
    xobject.image
  end

  # Only elements with exactly the same y are on the same line.
  def same_line(other)
    @y == other.y
  end

  # Sends this placement to +callback_handler+ when an XObject is
  # available.
  def send_content(previous, callback_handler)
    callback_handler.send_image(self) if xobject
  end

  # The inline image, or the XObject found (and then remembered) in the
  # resources of the current page.
  def xobject
    unless @xobject
      @xobject = @current_page.resources.xobject(@resource)
    end
    @xobject
  end
end
434
+ end