rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env ruby
2
+ # TestSpaceBug052004 -- rpdf2txt -- 28.05.2004 -- hwyss@ywesee.com
3
+
4
+ $: << File.dirname(__FILE__)
5
+ $: << File.expand_path('../lib', File.dirname(__FILE__))
6
+
7
+ require 'test/unit'
8
+ require 'rpdf2txt/parser'
9
+
10
+ module Rpdf2txt
11
+ class TestSpaceBug052004 < Test::Unit::TestCase
12
+ class FontDonor
13
+ attr_accessor :fonts, :attributes
14
+ def initialize
15
+ @attributes = {}
16
+ end
17
+ def font(key)
18
+ @fonts[key]
19
+ end
20
+ end
21
+ def test_same_word
22
+ font3_src = <<-EOS
23
+ 7 0 obj
24
+ <<
25
+ /Type /Font
26
+ /Subtype /Type1
27
+ /FirstChar 32
28
+ /LastChar 240
29
+ /Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
30
+ 556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
31
+ 800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
32
+ 556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
33
+ 278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
34
+ 611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
35
+ 722 0 0 0 0 0 722 0 556 556 556 0 0 0 556 556 556 0 0 0 278 278
36
+ 0 0 0 611 611 0 0 611 611 611 0 400 556 556 0 0 0 0 0 800 0 0 0
37
+ 278 0 0 278 600 278 278 0 611 278 278 278 278 278 0 0 278 0 0 0
38
+ 0 0 278 0 278 278 556 556 0 278 0 0 0 0 0 500 0 556 0 0 278 0 278
39
+ 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 278 ]
40
+ /Encoding /MacRomanEncoding
41
+ /BaseFont /Frutiger-Roman
42
+ /FontDescriptor 381 0 R
43
+ >>
44
+ endobj
45
+ EOS
46
+ font30_src = <<-EOS
47
+ 74 0 obj
48
+ <<
49
+ /Type /Font
50
+ /Subtype /Type1
51
+ /FirstChar 32
52
+ /LastChar 181
53
+ /Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
54
+ 556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
55
+ 800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
56
+ 556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
57
+ 278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
58
+ 611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
59
+ 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
60
+ 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
61
+ 278 278 556 556 278 278 278 278 278 800 278 278 278 278 278 278
62
+ 278 600 278 278 278 611 ]
63
+ /Encoding /WinAnsiEncoding
64
+ /BaseFont /Frutiger-Roman
65
+ /FontDescriptor 381 0 R
66
+ >>
67
+ endobj
68
+ EOS
69
+ font3 = Font.new(font3_src) # MacRoman Encoded, used for umlauts
70
+ font30 = Font.new(font30_src) # WinAnsi Encoded
71
+ path = File.expand_path('data/space_bug_stream.txt',
72
+ File.dirname(__FILE__))
73
+ stream = Stream.new(File.read(path))
74
+ page = FontDonor.new
75
+ page.fonts = {
76
+ :f3 => font3,
77
+ :f30 => font30,
78
+ }
79
+ snippets = []
80
+ snippets = stream.extract_text_objects(page, TextState.new)
81
+ sorted = snippets.sort
82
+ g_char = sorted[-4]
83
+ uml_char = sorted[-3]
84
+ assert_equal(true, uml_char.same_word(g_char))
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env ruby
2
+ # TestStream -- rpdf2txt -- 01.06.2005 -- hwyss@ywesee.com
3
+
4
+ $: << File.expand_path('../lib', File.dirname(__FILE__))
5
+
6
+ require 'test/unit'
7
+ require 'rpdf2txt/object'
8
+
9
+ module Rpdf2txt
10
+ class Stream < PdfObject
11
+ attr_writer :decoded_stream, :raw_stream
12
+ end
13
+ class TestStream < Test::Unit::TestCase
14
+ def setup
15
+ input = "3 0 obj << /Length 12 >> stream\n (Hello World!) endstream endobj"
16
+ @stream = Rpdf2txt::Stream.new(input)
17
+ end
18
+ def test_oneliner
19
+ src = <<-EOS
20
+ 1 0 0 1 -0.5 -0.5 cm q 455.043 -1.5 m 455.043 12.1 l 452.043 9.1 l 452.043 1.5 l
21
+ W* n 453.543 -0.5 m 453.543 11.1 l S Q q 455.043 12.1 m -1.5 12.1 l 1.5 9.1 l 452.043
22
+ 9.1 l W* n 454.043 10.6 m -0.5 10.6 l S Q q -1.5 12.1 m -1.5 -1.5 l 1.5 1.5 l 1.5
23
+ 9.1 l W* n 0 11.1 m 0 -0.5 l S Q 1 0 0 1 0 39.866 cm
24
+ BT /F1 16 Tf 1 0 0 -1 0 14.347 Tm(Ponstan�) Tj 0 0 0 RG ET
25
+ EOS
26
+ stream = Stream.new('')
27
+ stream.instance_variable_set('@decoded_stream', src)
28
+ snippets = stream.extract_text_objects(nil, TextState.new)
29
+ assert_equal(4, snippets.size, "no text snippets were found")
30
+ assert_instance_of(TextState, snippets.last)
31
+ end
32
+ def test_decode_raw_stream
33
+ file = File.expand_path('./data/firststream',
34
+ File.dirname(__FILE__))
35
+ deflated = File.read(file)
36
+ src = "stream\n#{deflated}endstream"
37
+ stream = Rpdf2txt::Stream.new(src)
38
+ stream.attributes.store(:filter, '/FlateDecode')
39
+ file = File.expand_path('./data/test.txt',
40
+ File.dirname(__FILE__))
41
+ expected = File.read(file)
42
+ assert_equal(expected, stream.decode_raw_stream)
43
+ end
44
+ def test_raw_stream
45
+ expected = ' (Hello World!) '
46
+ assert_equal(expected, @stream.raw_stream)
47
+ end
48
+ def test_decoded_stream
49
+ expected = ' (Hello World!) '
50
+ assert_equal(expected, @stream.decoded_stream)
51
+ end
52
+ def test_decoded_stream2
53
+ file = File.expand_path('./data/firststream', File.dirname(__FILE__))
54
+ @stream.raw_stream = File.read(file)
55
+ @stream.attributes[:filter] = "/FlateDecode"
56
+ file = File.expand_path('./data/test.txt', File.dirname(__FILE__))
57
+ expected = File.read(file)
58
+ assert_equal(expected, @stream.decoded_stream)
59
+ end
60
+ def test_extract_text_objects
61
+ file = File.expand_path('./data/stream.txt', File.dirname(__FILE__))
62
+ @stream.decoded_stream = File.read(file)
63
+ result = @stream.extract_text_objects(nil, TextState.new).select { |res|
64
+ res.is_a?(TextState)
65
+ }
66
+ assert_instance_of(Array, result)
67
+ assert_equal(69, result.size)
68
+ end
69
+ def test_extract_text_objects__artifact
70
+ @stream.decoded_stream = <<-EOS
71
+ BT
72
+ /P <</MCID 0 >>BDC
73
+ /CS0 cs 1 0 0 scn
74
+ /TT2 1 Tf
75
+ 7.98 0 0 7.98 42.54 718.7603 Tm
76
+ ( )Tj
77
+ /TT3 1 Tf
78
+ 6.0646 Tc 10.02 0 0 10.02 42.54 21.6803 Tm
79
+ ( )Tj
80
+ 22.635 0 Td
81
+ ( )Tj
82
+ 22.635 0 Td
83
+ ( )Tj
84
+ 0 g
85
+ /TT2 1 Tf
86
+ 0 Tc 0.0004 Tw 7.98 0 0 7.98 516.84 21.6803 Tm
87
+ (- Seite 1 / 32 - )Tj
88
+ EMC
89
+ /Artifact <</Type /Printing >>BDC
90
+ /TT0 1 Tf
91
+ 0 Tw -59.436 5.406 Td
92
+ (1)Tj
93
+ -0.0007 Tc 0.0049 Tw 7.02 0 0 7.02 46.98 59.3003 Tm
94
+ [(Siehe Legende a)8(u)3(f der let)8(z)-5(ten)12( Se)8(it)8(e des V)8(erze)8(i)-5(chnis)8(ses )]TJ
95
+ 0.0005 Tc 0 Tw -0.632 -1.299 Td
96
+ [(29.09)10(.09 )]TJ
97
+ /TT1 1 Tf
98
+ 0 Tc 7.98 0 0 7.98 42.54 41.2403 Tm
99
+ ( )Tj
100
+ /TT2 1 Tf
101
+ 9 0 0 9 42.54 31.0403 Tm
102
+ ( )Tj
103
+ EMC
104
+ ET
105
+ EOS
106
+ result = nil
107
+ assert_nothing_raised do
108
+ result = @stream.extract_text_objects(nil, TextState.new).select { |res|
109
+ res.is_a?(TextState)
110
+ }
111
+ end
112
+ assert_instance_of(Array, result)
113
+ assert_equal(22, result.size)
114
+ end
115
+ def test_extract_nontext_objects
116
+ src = <<-EOS
117
+ q Q q 12 12.240058 587.76001 767.76001 re W n /Cs1 cs 0 0 0 sc q 0.23999999
118
+ 0 0 0.23999999 64.800003 38.880058 cm
119
+ EOS
120
+ active = Matrix[[1,0,0], [0,1,0], [0,0,1]]
121
+ stack = [active]
122
+ expected = Matrix[[0.23999999, 0, 0],
123
+ [0, 0.23999999, 0],
124
+ [64.800003, 38.880058, 1]]
125
+ tmatrix = @stream.extract_nontext_objects(src, active, stack, [])
126
+ assert_equal(expected, tmatrix)
127
+ #assert_equal([0.23999999, 0, 0, 0.23999999, 64.800003, 38.880058], tmatrix)
128
+ end
129
+ def test_robust_et
130
+ src = <<-'EOS'
131
+ BT
132
+ 10 0 0 10 113.0394 341.8156 Tm
133
+ -0.0002 Tc
134
+ 0.0000 Tw
135
+ [(Zul.-Nr)91.6(.: )]TJ
136
+ /F10 1 Tf
137
+ 3.8771 0 TD
138
+ (55994)Tj
139
+ /F3 1 Tf
140
+ 8.8787 0 TD
141
+ -0.0001 Tc
142
+ [(V)36.8(erkaufskategorie: )]TJ
143
+ /F10 1 Tf
144
+ 9.1015 0 TD
145
+ 0.0000 Tc
146
+ (B)Tj
147
+ /F3 1 Tf
148
+ 3.6544 0 TD
149
+ -0.0001 Tc
150
+ -0.0306 Tw
151
+ [(Index: 20.01.0.)-9563.5(20.03.2003)]TJ
152
+ -25.5118 -2.2428 TD
153
+ 0.0000 Tw
154
+ [(Zusammensetzung:)-921.1(01)]TJ
155
+ 8.3 0 0 8.3 226.4252 319.3875 Tm
156
+ -0.0603 Tw
157
+ [(BARII CARBONAS D12 8.8)-139.1(%, CALCII IODIDUM D4 14.5)-139.1(%, CISTUS CANADE)]TJ
158
+ 33.6765 0 TD
159
+ [(NSIS D4 16.5)-139.1(%, CO-)]TJ
160
+ -33.6765 -1.3661 TD
161
+ -0.0203 Tw
162
+ [(NIUM MACULA)73.7(TUM D8 14.7)-139.1(%, ECHINACEA ANGUSTIFOLIA ET \(AUT\) P)36.7(ALLID)]TJ
163
+ 35.2499 0 TD
164
+ (A spag. Peka D5)Tj
165
+ -35.2499 -1.3661 TD
166
+ -0.1016 Tw
167
+ [(16.5)-139.1(%, JUGLANS REGIA spag. Peka D4 14.5)-139.1(%, SCROPHULARIA NODOSA D)]TJ
168
+ 33.6965 0 TD
169
+ [(4 14.5)-139.1(%, EXCIPIENS)]TJ
170
+ -33.6965 -1.3661 TD
171
+ -0.0306 Tw
172
+ (ad GLOBULOS.)Tj
173
+ 10 0 0 10 113.0394 271.1969 Tm
174
+ [(Anwendung:)-5285.2(Bei L)36.8(ymphdr�senentz�ndungen im Hals-Rachenraum)]TJ
175
+ 0 -1.4174 TD
176
+ -0.0002 Tc
177
+ 0.0000 Tw
178
+ [(Packung:)-5699.5(01)-305.7(001)-7141.9(10)-567.1(g)-18097.9(B)]TJ
179
+ T*
180
+ -0.0001 Tc
181
+ -0.0306 Tw
182
+ [(G�ltig bis:)-6647.8(19. M�rz 2008)]TJ
183
+ EOS
184
+ assert_nil(Stream::ET_PATTERN.match(src))
185
+ end
186
+ def test_et__carriage_return
187
+ src = <<-EOS
188
+ BT\r/F2 1 Tf\r6 0 0 6 476.805 591.0707 Tm\r/Cs5 cs 0 0 0 sc\r/GS1 gs\r0.2778 Tc\r0 Tw\r[(03.)-358.6(Jahrgang)]TJ\r0 -1.1667 TD\r(03)Tj\r3 0 0 3 486.8113 586.1957 Tm\r0 Tc\r(e)Tj\r6 0 0 6 492.2965 584.0707 Tm\r0.2779 Tc\r(ann\216e)Tj\r5 0 0 5 476.805 570.5707 Tm\r0.1111 Tc\r-0.0306 Tw\r(ISSN 0026-9212)Tj\r/F4 1 Tf\r14 0 0 14 476.5422 597.3198 Tm\r0 Tw\r(5/2004)Tj\rET\r
189
+ EOS
190
+ assert_not_nil(Stream::ET_PATTERN.match(src))
191
+ end
192
+
193
+ end
194
+ end
@@ -0,0 +1,315 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # TestTextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
24
+
25
+ $: << File.expand_path('../lib', File.dirname(__FILE__))
26
+ $: << File.dirname(__FILE__)
27
+
28
+ require 'test/unit'
29
+ require 'rpdf2txt/text_state'
30
+ require 'rpdf2txt/object'
31
+ require 'flexmock'
32
+
33
+ module Rpdf2txt
34
+ class TextState
35
+ attr_accessor :w, :char_spacing
36
+ end
37
+ end
38
+ class TestTextState < Test::Unit::TestCase
39
+ include FlexMock::TestCase
40
+ def setup
41
+ font_src = <<-EOS
42
+ 580 0 obj
43
+ <<
44
+ /Type /Font
45
+ /Subtype /Type1
46
+ /FirstChar 32
47
+ /LastChar 240
48
+ /Widths [ 278 389 500 556 556 1000 722 278 333 333 556 600 278 389 278 278
49
+ 556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
50
+ 800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
51
+ 556 778 611 556 556 722 667 1000 667 667 556 389 278 389 600 500
52
+ 278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
53
+ 611 611 389 444 389 611 556 889 556 556 500 333 222 333 600 278
54
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 0 0 611
55
+ 0 0 0 0 556 556 0 0 0 0 0 800 0 0 0 278 0 0 278 600 278 278 0 611
56
+ 278 278 278 278 278 0 0 278 0 0 0 0 0 278 0 278 278 0 0 0 278 0
57
+ 0 0 0 0 0 0 0 0 0 0 0 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
58
+ 0 0 0 0 0 278 ]
59
+ /Encoding /MacRomanEncoding
60
+ /BaseFont /Frutiger-BoldItalic
61
+ /FontDescriptor 579 0 R
62
+ >>
63
+ endobj
64
+ EOS
65
+ @text_state = Rpdf2txt::TextState.new('latin1')
66
+ @font = Rpdf2txt::Font.new(font_src)
67
+ @text_state.set_font(@font)
68
+ end
69
+ def test_set
70
+ @text_state.set_x("42.7953")
71
+ @text_state.set_y("670.6528")
72
+ @text_state.update!
73
+ assert_equal(427953, (@text_state.x*10000).round)
74
+ assert_equal(6706528, (@text_state.y*10000).round)
75
+ end
76
+ def test_update_td
77
+ @text_state.set_x("100.1234")
78
+ @text_state.set_y("200.5678")
79
+ @text_state.update_x("1.2345")
80
+ @text_state.update_y("6.7890")
81
+ @text_state.update!
82
+ assert_equal(1013579, (@text_state.x*10000).to_i)
83
+ assert_equal(2073568, (@text_state.y*10000).to_i)
84
+ @text_state.update_x("-1.2345")
85
+ @text_state.update_y("-6.7890")
86
+ @text_state.update!
87
+ assert_equal(1001234, (@text_state.x*10000).to_i)
88
+ assert_equal(2005678, (@text_state.y*10000).to_i)
89
+ end
90
+ def test_update_tD
91
+ @text_state.set_x("100.1234")
92
+ @text_state.set_y("200.5678")
93
+ @text_state.set_lead("-6.7890")
94
+ @text_state.update_x("1.2345")
95
+ @text_state.update_y("6.7890")
96
+ @text_state.update!
97
+ assert_equal(-6789.0, @text_state.lead*1000)
98
+ assert_equal(1013579, (@text_state.x*10000).to_i)
99
+ assert_equal(2073568, (@text_state.y*10000).to_i)
100
+ @text_state.set_lead(-1.2345)
101
+ @text_state.set_lead(6.7890)
102
+ @text_state.update_x("-1.2345")
103
+ @text_state.update_y("-6.7890")
104
+ @text_state.update!
105
+ assert_equal(6789.0, @text_state.lead*1000)
106
+ assert_equal(1001234, (@text_state.x*10000).to_i)
107
+ assert_equal(2005678, (@text_state.y*10000).to_i)
108
+ end
109
+ def test_set_lead_tl
110
+ @text_state.set_lead("1.2345")
111
+ assert_equal(1234.5, @text_state.lead*1000)
112
+ @text_state.set_lead("-6.7890")
113
+ assert_equal(-6789.0, @text_state.lead*1000)
114
+ end
115
+ def test_step
116
+ @text_state.set_lead(-3.4567)
117
+ @text_state.set_y(200.1234)
118
+ @text_state.set_x(400.5678)
119
+ @text_state.set_txt('foo')
120
+ @text_state.advance_x
121
+ @text_state.update!
122
+ assert_equal(2001234, (@text_state.y * 10000).to_i)
123
+ assert_not_equal(4005678, (@text_state.x * 10000).to_i)
124
+ @text_state.step
125
+ @text_state.update!
126
+ assert_equal(1966667, (@text_state.y * 10000).to_i)
127
+ assert_equal(4005678, (@text_state.x * 10000).to_i)
128
+ end
129
+ def test_compare1
130
+ @text_state.set_font_size(0.0)
131
+ @text_state.set_x 100.1234
132
+ @text_state.set_y 200.5678
133
+ @text_state.update!
134
+ text_state = Rpdf2txt::TextState.new
135
+ text_state.set_font_size(0.0)
136
+ text_state.set_x 88.9012
137
+ text_state.set_y 250.3456
138
+ text_state.update!
139
+ assert(text_state > @text_state, text_state <=> @text_state)
140
+ end
141
+ def test_compare2
142
+ @text_state.set_font_size(10)
143
+ @text_state.set_x 100.1234
144
+ @text_state.set_y 200.5678
145
+ @text_state.update!
146
+ text_state = Rpdf2txt::TextState.new
147
+ text_state.set_font_size(10)
148
+ text_state.set_x 88.9012
149
+ text_state.set_y 200.5678
150
+ text_state.update!
151
+ assert(text_state < @text_state, text_state <=> @text_state)
152
+ end
153
+ def test_same_word # four states at the same y must all report same_word; moving p1 to another y must break that for p1
154
+ (p1 = Rpdf2txt::TextState.new).set_x -10000
155
+ (p1 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): rebinds p1 to a fresh object, so the set_x above is discarded - confirm whether intended
156
+ (p2 = Rpdf2txt::TextState.new).set_x -5000
157
+ (p2 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): same rebinding - p2 never receives set_x
158
+ (p3 = Rpdf2txt::TextState.new).set_x 5000
159
+ (p3 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): same rebinding - p3 never receives set_x
160
+ (p4 = Rpdf2txt::TextState.new).set_x 10000
161
+ (p4 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): same rebinding - p4 never receives set_x
162
+ p1.set_font_size(10)
163
+ p2.set_font_size(10)
164
+ p3.set_font_size(10)
165
+ p4.set_font_size(10)
166
+ p1.update!
167
+ p2.update!
168
+ p3.update!
169
+ p4.update!
170
+ assert_equal(true, p1.same_word(p1)) # every pairing (including self) is expected to be the same word
171
+ assert_equal(true, p1.same_word(p2))
172
+ assert_equal(true, p1.same_word(p3))
173
+ assert_equal(true, p1.same_word(p4))
174
+ assert_equal(true, p2.same_word(p2))
175
+ assert_equal(true, p2.same_word(p3))
176
+ assert_equal(true, p2.same_word(p4))
177
+ assert_equal(true, p3.same_word(p3))
178
+ assert_equal(true, p3.same_word(p4))
179
+ assert_equal(true, p4.same_word(p4))
180
+ p1.set_y -10 # move only p1 to a different y
181
+ p1.update!
182
+ assert_equal(true, p1.same_word(p1)) # still the same word as itself
183
+ assert_equal(false, p1.same_word(p2)) # but no longer with the states left at y 10000
184
+ assert_equal(false, p1.same_word(p3))
185
+ assert_equal(false, p1.same_word(p4))
186
+ end
187
+ def test_same_line
188
+ ts1 = Rpdf2txt::TextState.new
189
+ ts1.set_y(210)
190
+ ts1.set_font_size(10)
191
+ ts1.update!
192
+ assert_equal(210.000, ts1.y)
193
+ assert_equal(200.000, ts1.y2)
194
+
195
+ ts2 = Rpdf2txt::TextState.new
196
+ ts2.set_y(200)
197
+ ts2.set_font_size(10)
198
+ ts2.update!
199
+ assert_equal(200.000, ts2.y)
200
+ assert_equal(190.000, ts2.y2)
201
+
202
+ # -----
203
+ # ts1
204
+ # ----- ----- => not same line
205
+ # ts2
206
+ # -----
207
+ assert_equal(false, ts1.same_line(ts2))
208
+ assert_equal(false, ts2.same_line(ts1))
209
+
210
+ ts3 = Rpdf2txt::TextState.new
211
+ ts3.set_y(205)
212
+ ts3.set_font_size(10)
213
+ ts3.update!
214
+ assert_equal(205.000, ts3.y)
215
+ assert_equal(195.000, ts3.y2)
216
+
217
+ # -----
218
+ # ts1 -----
219
+ # ----- ts3 => same line
220
+ # -----
221
+ assert_equal(true, ts1.same_line(ts3))
222
+ assert_equal(true, ts3.same_line(ts1))
223
+
224
+ # -----
225
+ # ----- ts3 => same line
226
+ # ts2 -----
227
+ # -----
228
+ assert_equal(true, ts2.same_line(ts3))
229
+ assert_equal(true, ts3.same_line(ts2))
230
+
231
+ ts4 = Rpdf2txt::TextState.new
232
+ ts4.set_y(210)
233
+ ts4.set_font_size(30)
234
+ ts4.update!
235
+ assert_equal(210.000, ts4.y)
236
+ assert_equal(180.000, ts4.y2)
237
+
238
+ # -----
239
+ # -----
240
+ # ts1 ts4 => same line
241
+ # -----
242
+ # -----
243
+ assert_equal(true, ts1.same_line(ts4))
244
+ assert_equal(true, ts4.same_line(ts1))
245
+ end
246
+ def test_set_txt
247
+ @text_state.set_txt("Hello World")
248
+ assert_in_delta(5.612, @text_state.w, 0.001)
249
+ end
250
+ def test_set_char_spacing
251
+ assert_equal(0, @text_state.char_spacing)
252
+ @text_state.set_char_spacing('-0.456 Tc')
253
+ assert_equal(-456, @text_state.char_spacing)
254
+ @text_state.set_char_spacing('0.789 Tc')
255
+ assert_equal(789, @text_state.char_spacing)
256
+ end
257
+ def test_same_word2 # like test_same_word, but each state also gets the fixture font and its own three-character text
258
+ (p1 = Rpdf2txt::TextState.new).set_x -10000
259
+ (p1 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): rebinds p1 to a fresh object, so the set_x above is discarded - confirm whether intended
260
+ (p2 = Rpdf2txt::TextState.new).set_x -5000
261
+ (p2 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): same rebinding - p2 never receives set_x
262
+ (p3 = Rpdf2txt::TextState.new).set_x 5000
263
+ (p3 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): same rebinding - p3 never receives set_x
264
+ (p4 = Rpdf2txt::TextState.new).set_x 10000
265
+ (p4 = Rpdf2txt::TextState.new).set_y 10000 # NOTE(review): same rebinding - p4 never receives set_x
266
+ p1.set_font_size(10.0)
267
+ p2.set_font_size(10.0)
268
+ p3.set_font_size(10.0)
269
+ p4.set_font_size(10.0)
270
+ chars = %w(a b c)
271
+ [p1,p2,p3,p4].each { |text_state|
272
+ text_state.set_font(@font)
273
+ text_state.set_txt(chars.join) # first state gets "abc"
274
+ text_state.update!
275
+ chars = chars.collect { |char| char.next } # shift every letter by one so the next state gets different text ("bcd", ...)
276
+ }
277
+ assert_equal(true, p1.same_word(p1)) # every pairing (including self) is expected to be the same word
278
+ assert_equal(true, p1.same_word(p2))
279
+ assert_equal(true, p1.same_word(p3))
280
+ assert_equal(true, p1.same_word(p4))
281
+ assert_equal(true, p2.same_word(p2))
282
+ assert_equal(true, p2.same_word(p3))
283
+ assert_equal(true, p2.same_word(p4))
284
+ assert_equal(true, p3.same_word(p3))
285
+ assert_equal(true, p3.same_word(p4))
286
+ assert_equal(true, p4.same_word(p4))
287
+ p1.set_y -10 # move only p1 to a different y
288
+ p1.update!
289
+ assert_equal(true, p1.same_word(p1)) # still the same word as itself
290
+ assert_equal(false, p1.same_word(p2)) # but no longer with the states left at y 10000
291
+ assert_equal(false, p1.same_word(p3))
292
+ assert_equal(false, p1.same_word(p4))
293
+ end
294
+ def test_char_width
295
+ assert_equal(0.556, @text_state.char_width('a'))
296
+ assert_equal(0.278, @text_state.char_width(' '))
297
+ @text_state.set_char_spacing('0.023')
298
+ assert_equal(0.579, @text_state.char_width('a'))
299
+ assert_equal(0.301, @text_state.char_width(' '))
300
+ @text_state.set_word_spacing('0.012')
301
+ assert_equal(0.579, @text_state.char_width('a'))
302
+ assert_equal(0.313, @text_state.char_width(' '))
303
+ end
304
+ def test_txt
305
+ font = flexmock('font')
306
+ input = "Anwendung: Bei nerv�sen Herzbeschwerden"
307
+ font.should_receive(:encoding).and_return('mac')
308
+ font.should_receive(:attributes).and_return({})
309
+ font.should_ignore_missing
310
+ @text_state.set_font(font)
311
+ @text_state.set_txt(input)
312
+ expected = "Anwendung: Bei nerv�sen Herzbeschwerden"
313
+ assert_equal(expected, @text_state.txt)
314
+ end
315
+ end