rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env ruby
2
+ # TestSpaceBug052004 -- rpdf2txt -- 28.05.2004 -- hwyss@ywesee.com
3
+
4
+ $: << File.dirname(__FILE__)
5
+ $: << File.expand_path('../lib', File.dirname(__FILE__))
6
+
7
+ require 'test/unit'
8
+ require 'rpdf2txt/parser'
9
+
10
+ module Rpdf2txt
11
+ class TestSpaceBug052004 < Test::Unit::TestCase
12
+ class FontDonor # Minimal stand-in for a page: test_same_word passes an instance to Stream#extract_text_objects as the font source.
13
+ attr_accessor :fonts, :attributes
14
+ def initialize
15
+ @attributes = {} # @fonts is left nil until the fonts= writer is called
16
+ end
17
+ def font(key) # Looks up a font by key (test_same_word stores :f3 and :f30).
18
+ @fonts[key] # NOTE(review): raises NoMethodError if fonts was never assigned
19
+ end
20
+ end
21
+ def test_same_word # Regression test for the 05/2004 space bug: extracts data/space_bug_stream.txt with two Frutiger-Roman fonts (:f3 MacRoman, :f30 WinAnsi) and expects sorted[-3] (umlaut) and sorted[-4] ("g") to be the same word. NOTE(review): the `snippets = []` initialisation is overwritten on the very next statement.
22
+ font3_src = <<-EOS
23
+ 7 0 obj
24
+ <<
25
+ /Type /Font
26
+ /Subtype /Type1
27
+ /FirstChar 32
28
+ /LastChar 240
29
+ /Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
30
+ 556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
31
+ 800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
32
+ 556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
33
+ 278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
34
+ 611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
35
+ 722 0 0 0 0 0 722 0 556 556 556 0 0 0 556 556 556 0 0 0 278 278
36
+ 0 0 0 611 611 0 0 611 611 611 0 400 556 556 0 0 0 0 0 800 0 0 0
37
+ 278 0 0 278 600 278 278 0 611 278 278 278 278 278 0 0 278 0 0 0
38
+ 0 0 278 0 278 278 556 556 0 278 0 0 0 0 0 500 0 556 0 0 278 0 278
39
+ 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 278 ]
40
+ /Encoding /MacRomanEncoding
41
+ /BaseFont /Frutiger-Roman
42
+ /FontDescriptor 381 0 R
43
+ >>
44
+ endobj
45
+ EOS
46
+ font30_src = <<-EOS
47
+ 74 0 obj
48
+ <<
49
+ /Type /Font
50
+ /Subtype /Type1
51
+ /FirstChar 32
52
+ /LastChar 181
53
+ /Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
54
+ 556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
55
+ 800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
56
+ 556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
57
+ 278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
58
+ 611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
59
+ 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
60
+ 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
61
+ 278 278 556 556 278 278 278 278 278 800 278 278 278 278 278 278
62
+ 278 600 278 278 278 611 ]
63
+ /Encoding /WinAnsiEncoding
64
+ /BaseFont /Frutiger-Roman
65
+ /FontDescriptor 381 0 R
66
+ >>
67
+ endobj
68
+ EOS
69
+ font3 = Font.new(font3_src) # MacRoman Encoded, used for umlauts
70
+ font30 = Font.new(font30_src) # WinAnsi Encoded
71
+ path = File.expand_path('data/space_bug_stream.txt',
72
+ File.dirname(__FILE__))
73
+ stream = Stream.new(File.read(path))
74
+ page = FontDonor.new
75
+ page.fonts = {
76
+ :f3 => font3,
77
+ :f30 => font30,
78
+ }
79
+ snippets = []
80
+ snippets = stream.extract_text_objects(page, TextState.new)
81
+ sorted = snippets.sort
82
+ g_char = sorted[-4]
83
+ uml_char = sorted[-3]
84
+ assert_equal(true, uml_char.same_word(g_char))
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env ruby
2
+ # TestStream -- rpdf2txt -- 01.06.2005 -- hwyss@ywesee.com
3
+
4
+ $: << File.expand_path('../lib', File.dirname(__FILE__))
5
+
6
+ require 'test/unit'
7
+ require 'rpdf2txt/object'
8
+
9
+ module Rpdf2txt
10
+ class Stream < PdfObject # Test-only reopening of the class under test.
11
+ attr_writer :decoded_stream, :raw_stream # lets tests inject stream contents directly (see test_decoded_stream2, test_extract_text_objects)
12
+ end
13
+ class TestStream < Test::Unit::TestCase
14
+ def setup # Builds the Stream under test from a minimal single-object PDF snippet.
15
+ input = "3 0 obj << /Length 12 >> stream\n (Hello World!) endstream endobj"
16
+ @stream = Rpdf2txt::Stream.new(input) # stored in @stream for the tests below
17
+ end
18
+ def test_oneliner # Content stream with graphics operators wrapped over several lines plus one BT ... ET text object; expects 4 snippets from extract_text_objects, the last one a TextState. NOTE(review): the decoded stream is injected via instance_variable_set although this file adds a decoded_stream= writer.
19
+ src = <<-EOS
20
+ 1 0 0 1 -0.5 -0.5 cm q 455.043 -1.5 m 455.043 12.1 l 452.043 9.1 l 452.043 1.5 l
21
+ W* n 453.543 -0.5 m 453.543 11.1 l S Q q 455.043 12.1 m -1.5 12.1 l 1.5 9.1 l 452.043
22
+ 9.1 l W* n 454.043 10.6 m -0.5 10.6 l S Q q -1.5 12.1 m -1.5 -1.5 l 1.5 1.5 l 1.5
23
+ 9.1 l W* n 0 11.1 m 0 -0.5 l S Q 1 0 0 1 0 39.866 cm
24
+ BT /F1 16 Tf 1 0 0 -1 0 14.347 Tm(Ponstan�) Tj 0 0 0 RG ET
25
+ EOS
26
+ stream = Stream.new('')
27
+ stream.instance_variable_set('@decoded_stream', src)
28
+ snippets = stream.extract_text_objects(nil, TextState.new)
29
+ assert_equal(4, snippets.size, "no text snippets were found")
30
+ assert_instance_of(TextState, snippets.last)
31
+ end
32
+ def test_decode_raw_stream # Wraps the fixture data/firststream in stream/endstream markers, declares /FlateDecode and expects decode_raw_stream to return the contents of data/test.txt.
33
+ file = File.expand_path('./data/firststream',
34
+ File.dirname(__FILE__))
35
+ deflated = File.read(file) # NOTE(review): compressed fixture read in text mode — presumably fine on Unix; confirm whether binary mode is needed elsewhere
36
+ src = "stream\n#{deflated}endstream"
37
+ stream = Rpdf2txt::Stream.new(src)
38
+ stream.attributes.store(:filter, '/FlateDecode') # src carries no attribute dictionary, so the filter is set by hand
39
+ file = File.expand_path('./data/test.txt',
40
+ File.dirname(__FILE__))
41
+ expected = File.read(file)
42
+ assert_equal(expected, stream.decode_raw_stream)
43
+ end
44
+ def test_raw_stream # raw_stream is expected to be the text between "stream\n" and "endstream" of the setup input.
45
+ expected = ' (Hello World!) ' # leading and trailing space are part of the expected value
46
+ assert_equal(expected, @stream.raw_stream)
47
+ end
48
+ def test_decoded_stream # The setup object declares only /Length (no filter), so decoded_stream is expected to equal the raw text.
49
+ expected = ' (Hello World!) '
50
+ assert_equal(expected, @stream.decoded_stream)
51
+ end
52
+ def test_decoded_stream2 # Replaces the raw stream with the fixture data/firststream, sets :filter to /FlateDecode and expects decoded_stream to equal data/test.txt.
53
+ file = File.expand_path('./data/firststream', File.dirname(__FILE__))
54
+ @stream.raw_stream = File.read(file) # uses the attr_writer added at the top of this file
55
+ @stream.attributes[:filter] = "/FlateDecode"
56
+ file = File.expand_path('./data/test.txt', File.dirname(__FILE__))
57
+ expected = File.read(file)
58
+ assert_equal(expected, @stream.decoded_stream)
59
+ end
60
+ def test_extract_text_objects # Feeds data/stream.txt as the decoded stream and expects 69 TextState results.
61
+ file = File.expand_path('./data/stream.txt', File.dirname(__FILE__))
62
+ @stream.decoded_stream = File.read(file)
63
+ result = @stream.extract_text_objects(nil, TextState.new).select { |res|
64
+ res.is_a?(TextState) # entries of any other class are dropped before counting
65
+ }
66
+ assert_instance_of(Array, result)
67
+ assert_equal(69, result.size)
68
+ end
69
+ def test_extract_text_objects__artifact # Text object containing marked-content sections (/P and /Artifact ... BDC / EMC): extraction must not raise and must yield 22 TextState results.
70
+ @stream.decoded_stream = <<-EOS
71
+ BT
72
+ /P <</MCID 0 >>BDC
73
+ /CS0 cs 1 0 0 scn
74
+ /TT2 1 Tf
75
+ 7.98 0 0 7.98 42.54 718.7603 Tm
76
+ ( )Tj
77
+ /TT3 1 Tf
78
+ 6.0646 Tc 10.02 0 0 10.02 42.54 21.6803 Tm
79
+ ( )Tj
80
+ 22.635 0 Td
81
+ ( )Tj
82
+ 22.635 0 Td
83
+ ( )Tj
84
+ 0 g
85
+ /TT2 1 Tf
86
+ 0 Tc 0.0004 Tw 7.98 0 0 7.98 516.84 21.6803 Tm
87
+ (- Seite 1 / 32 - )Tj
88
+ EMC
89
+ /Artifact <</Type /Printing >>BDC
90
+ /TT0 1 Tf
91
+ 0 Tw -59.436 5.406 Td
92
+ (1)Tj
93
+ -0.0007 Tc 0.0049 Tw 7.02 0 0 7.02 46.98 59.3003 Tm
94
+ [(Siehe Legende a)8(u)3(f der let)8(z)-5(ten)12( Se)8(it)8(e des V)8(erze)8(i)-5(chnis)8(ses )]TJ
95
+ 0.0005 Tc 0 Tw -0.632 -1.299 Td
96
+ [(29.09)10(.09 )]TJ
97
+ /TT1 1 Tf
98
+ 0 Tc 7.98 0 0 7.98 42.54 41.2403 Tm
99
+ ( )Tj
100
+ /TT2 1 Tf
101
+ 9 0 0 9 42.54 31.0403 Tm
102
+ ( )Tj
103
+ EMC
104
+ ET
105
+ EOS
106
+ result = nil
107
+ assert_nothing_raised do
108
+ result = @stream.extract_text_objects(nil, TextState.new).select { |res|
109
+ res.is_a?(TextState)
110
+ }
111
+ end
112
+ assert_instance_of(Array, result)
113
+ assert_equal(22, result.size)
114
+ end
115
+ def test_extract_nontext_objects # Runs graphics-only operators through extract_nontext_objects starting from an identity matrix; the expected 3x3 result is built from the six operands of the final cm operator. NOTE(review): Matrix is not required in this file — presumably loaded via rpdf2txt/object; confirm.
116
+ src = <<-EOS
117
+ q Q q 12 12.240058 587.76001 767.76001 re W n /Cs1 cs 0 0 0 sc q 0.23999999
118
+ 0 0 0.23999999 64.800003 38.880058 cm
119
+ EOS
120
+ active = Matrix[[1,0,0], [0,1,0], [0,0,1]]
121
+ stack = [active]
122
+ expected = Matrix[[0.23999999, 0, 0],
123
+ [0, 0.23999999, 0],
124
+ [64.800003, 38.880058, 1]]
125
+ tmatrix = @stream.extract_nontext_objects(src, active, stack, [])
126
+ assert_equal(expected, tmatrix)
127
+ #assert_equal([0.23999999, 0, 0, 0.23999999, 64.800003, 38.880058], tmatrix)
128
+ end
129
+ def test_robust_et # Text object that opens with BT but has no closing ET operator; the only "ET" is inside a string operand ("... ANGUSTIFOLIA ET \(AUT\) ..."), so Stream::ET_PATTERN must not match. The heredoc is single-quoted, so backslashes stay literal.
130
+ src = <<-'EOS'
131
+ BT
132
+ 10 0 0 10 113.0394 341.8156 Tm
133
+ -0.0002 Tc
134
+ 0.0000 Tw
135
+ [(Zul.-Nr)91.6(.: )]TJ
136
+ /F10 1 Tf
137
+ 3.8771 0 TD
138
+ (55994)Tj
139
+ /F3 1 Tf
140
+ 8.8787 0 TD
141
+ -0.0001 Tc
142
+ [(V)36.8(erkaufskategorie: )]TJ
143
+ /F10 1 Tf
144
+ 9.1015 0 TD
145
+ 0.0000 Tc
146
+ (B)Tj
147
+ /F3 1 Tf
148
+ 3.6544 0 TD
149
+ -0.0001 Tc
150
+ -0.0306 Tw
151
+ [(Index: 20.01.0.)-9563.5(20.03.2003)]TJ
152
+ -25.5118 -2.2428 TD
153
+ 0.0000 Tw
154
+ [(Zusammensetzung:)-921.1(01)]TJ
155
+ 8.3 0 0 8.3 226.4252 319.3875 Tm
156
+ -0.0603 Tw
157
+ [(BARII CARBONAS D12 8.8)-139.1(%, CALCII IODIDUM D4 14.5)-139.1(%, CISTUS CANADE)]TJ
158
+ 33.6765 0 TD
159
+ [(NSIS D4 16.5)-139.1(%, CO-)]TJ
160
+ -33.6765 -1.3661 TD
161
+ -0.0203 Tw
162
+ [(NIUM MACULA)73.7(TUM D8 14.7)-139.1(%, ECHINACEA ANGUSTIFOLIA ET \(AUT\) P)36.7(ALLID)]TJ
163
+ 35.2499 0 TD
164
+ (A spag. Peka D5)Tj
165
+ -35.2499 -1.3661 TD
166
+ -0.1016 Tw
167
+ [(16.5)-139.1(%, JUGLANS REGIA spag. Peka D4 14.5)-139.1(%, SCROPHULARIA NODOSA D)]TJ
168
+ 33.6965 0 TD
169
+ [(4 14.5)-139.1(%, EXCIPIENS)]TJ
170
+ -33.6965 -1.3661 TD
171
+ -0.0306 Tw
172
+ (ad GLOBULOS.)Tj
173
+ 10 0 0 10 113.0394 271.1969 Tm
174
+ [(Anwendung:)-5285.2(Bei L)36.8(ymphdr�senentz�ndungen im Hals-Rachenraum)]TJ
175
+ 0 -1.4174 TD
176
+ -0.0002 Tc
177
+ 0.0000 Tw
178
+ [(Packung:)-5699.5(01)-305.7(001)-7141.9(10)-567.1(g)-18097.9(B)]TJ
179
+ T*
180
+ -0.0001 Tc
181
+ -0.0306 Tw
182
+ [(G�ltig bis:)-6647.8(19. M�rz 2008)]TJ
183
+ EOS
184
+ assert_nil(Stream::ET_PATTERN.match(src))
185
+ end
186
+ def test_et__carriage_return # Counterpart to test_robust_et: operators are separated by carriage returns and the text object does end in ET, so Stream::ET_PATTERN must match. The heredoc is unquoted, so \r and \216 are interpreted as escape sequences.
187
+ src = <<-EOS
188
+ BT\r/F2 1 Tf\r6 0 0 6 476.805 591.0707 Tm\r/Cs5 cs 0 0 0 sc\r/GS1 gs\r0.2778 Tc\r0 Tw\r[(03.)-358.6(Jahrgang)]TJ\r0 -1.1667 TD\r(03)Tj\r3 0 0 3 486.8113 586.1957 Tm\r0 Tc\r(e)Tj\r6 0 0 6 492.2965 584.0707 Tm\r0.2779 Tc\r(ann\216e)Tj\r5 0 0 5 476.805 570.5707 Tm\r0.1111 Tc\r-0.0306 Tw\r(ISSN 0026-9212)Tj\r/F4 1 Tf\r14 0 0 14 476.5422 597.3198 Tm\r0 Tw\r(5/2004)Tj\rET\r
189
+ EOS
190
+ assert_not_nil(Stream::ET_PATTERN.match(src))
191
+ end
192
+
193
+ end
194
+ end
@@ -0,0 +1,315 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # TestTextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
24
+
25
+ $: << File.expand_path('../lib', File.dirname(__FILE__))
26
+ $: << File.dirname(__FILE__)
27
+
28
+ require 'test/unit'
29
+ require 'rpdf2txt/text_state'
30
+ require 'rpdf2txt/object'
31
+ require 'flexmock'
32
+
33
+ module Rpdf2txt
34
+ class TextState # Test-only reopening of the class under test.
35
+ attr_accessor :w, :char_spacing # exposes the values asserted in test_set_txt and test_set_char_spacing
36
+ end
37
+ end
38
+ class TestTextState < Test::Unit::TestCase
39
+ include FlexMock::TestCase
40
+ def setup # Creates a TextState with target encoding 'latin1' and attaches a MacRoman-encoded Frutiger-BoldItalic Type1 font parsed from the inline PDF object; both are kept in instance variables for the tests.
41
+ font_src = <<-EOS
42
+ 580 0 obj
43
+ <<
44
+ /Type /Font
45
+ /Subtype /Type1
46
+ /FirstChar 32
47
+ /LastChar 240
48
+ /Widths [ 278 389 500 556 556 1000 722 278 333 333 556 600 278 389 278 278
49
+ 556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
50
+ 800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
51
+ 556 778 611 556 556 722 667 1000 667 667 556 389 278 389 600 500
52
+ 278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
53
+ 611 611 389 444 389 611 556 889 556 556 500 333 222 333 600 278
54
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 0 0 611
55
+ 0 0 0 0 556 556 0 0 0 0 0 800 0 0 0 278 0 0 278 600 278 278 0 611
56
+ 278 278 278 278 278 0 0 278 0 0 0 0 0 278 0 278 278 0 0 0 278 0
57
+ 0 0 0 0 0 0 0 0 0 0 0 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
58
+ 0 0 0 0 0 278 ]
59
+ /Encoding /MacRomanEncoding
60
+ /BaseFont /Frutiger-BoldItalic
61
+ /FontDescriptor 579 0 R
62
+ >>
63
+ endobj
64
+ EOS
65
+ @text_state = Rpdf2txt::TextState.new('latin1')
66
+ @font = Rpdf2txt::Font.new(font_src)
67
+ @text_state.set_font(@font)
68
+ end
69
+ def test_set # set_x / set_y are given numeric strings; after update! the coordinates must match to four decimals.
70
+ @text_state.set_x("42.7953")
71
+ @text_state.set_y("670.6528")
72
+ @text_state.update!
73
+ assert_equal(427953, (@text_state.x*10000).round) # scaled and rounded to avoid comparing floats directly
74
+ assert_equal(6706528, (@text_state.y*10000).round)
75
+ end
76
+ def test_update_td # update_x / update_y are expected to apply relative offsets: adding and then subtracting the same offsets returns to the start position.
77
+ @text_state.set_x("100.1234")
78
+ @text_state.set_y("200.5678")
79
+ @text_state.update_x("1.2345")
80
+ @text_state.update_y("6.7890")
81
+ @text_state.update!
82
+ assert_equal(1013579, (@text_state.x*10000).to_i) # NOTE(review): to_i truncates, whereas test_set uses round; a float product just below the integer would be off by one
83
+ assert_equal(2073568, (@text_state.y*10000).to_i)
84
+ @text_state.update_x("-1.2345") # undo the offsets applied above
85
+ @text_state.update_y("-6.7890")
86
+ @text_state.update!
87
+ assert_equal(1001234, (@text_state.x*10000).to_i)
88
+ assert_equal(2005678, (@text_state.y*10000).to_i)
89
+ end
90
+ def test_update_tD # Same offsets as test_update_td, additionally checking that set_lead stores the leading without disturbing the position.
91
+ @text_state.set_x("100.1234")
92
+ @text_state.set_y("200.5678")
93
+ @text_state.set_lead("-6.7890")
94
+ @text_state.update_x("1.2345")
95
+ @text_state.update_y("6.7890")
96
+ @text_state.update!
97
+ assert_equal(-6789.0, @text_state.lead*1000)
98
+ assert_equal(1013579, (@text_state.x*10000).to_i)
99
+ assert_equal(2073568, (@text_state.y*10000).to_i)
100
+ @text_state.set_lead(-1.2345) # NOTE(review): overwritten by the next call before it is ever read
101
+ @text_state.set_lead(6.7890) # numeric argument here, string argument above
102
+ @text_state.update_x("-1.2345")
103
+ @text_state.update_y("-6.7890")
104
+ @text_state.update!
105
+ assert_equal(6789.0, @text_state.lead*1000)
106
+ assert_equal(1001234, (@text_state.x*10000).to_i)
107
+ assert_equal(2005678, (@text_state.y*10000).to_i)
108
+ end
109
+ def test_set_lead_tl # set_lead is given positive and negative numeric strings and lead must return the corresponding number.
110
+ @text_state.set_lead("1.2345")
111
+ assert_equal(1234.5, @text_state.lead*1000)
112
+ @text_state.set_lead("-6.7890") # a later call replaces the earlier value
113
+ assert_equal(-6789.0, @text_state.lead*1000)
114
+ end
115
+ def test_step # Expects advance_x to move x away from its set value, and step to then shift y by the lead (200.1234 - 3.4567) and put x back at 400.5678.
116
+ @text_state.set_lead(-3.4567)
117
+ @text_state.set_y(200.1234)
118
+ @text_state.set_x(400.5678)
119
+ @text_state.set_txt('foo')
120
+ @text_state.advance_x
121
+ @text_state.update!
122
+ assert_equal(2001234, (@text_state.y * 10000).to_i) # y untouched by advance_x
123
+ assert_not_equal(4005678, (@text_state.x * 10000).to_i) # x has moved
124
+ @text_state.step
125
+ @text_state.update!
126
+ assert_equal(1966667, (@text_state.y * 10000).to_i)
127
+ assert_equal(4005678, (@text_state.x * 10000).to_i)
128
+ end
129
+ def test_compare1 # With font size 0, the state with the larger y (250.3456 vs 200.5678) is expected to compare greater even though its x is smaller.
130
+ @text_state.set_font_size(0.0)
131
+ @text_state.set_x 100.1234
132
+ @text_state.set_y 200.5678
133
+ @text_state.update!
134
+ text_state = Rpdf2txt::TextState.new # second state, created without an encoding argument or font
135
+ text_state.set_font_size(0.0)
136
+ text_state.set_x 88.9012
137
+ text_state.set_y 250.3456
138
+ text_state.update!
139
+ assert(text_state > @text_state, text_state <=> @text_state) # second argument (the <=> result) is passed as the failure message
140
+ end
141
+ def test_compare2 # With equal y and font size, the state with the smaller x (88.9012 vs 100.1234) is expected to compare lower.
142
+ @text_state.set_font_size(10)
143
+ @text_state.set_x 100.1234
144
+ @text_state.set_y 200.5678
145
+ @text_state.update!
146
+ text_state = Rpdf2txt::TextState.new
147
+ text_state.set_font_size(10)
148
+ text_state.set_x 88.9012
149
+ text_state.set_y 200.5678 # same y as @text_state, so only x differs
150
+ text_state.update!
151
+ assert(text_state < @text_state, text_state <=> @text_state) # second argument (the <=> result) is passed as the failure message
152
+ end
153
+ def test_same_word # Expects same_word to hold between states sharing y and font size, and to fail against p1 once p1 is moved to a different y.
154
+ (p1 = Rpdf2txt::TextState.new).set_x -10000 # NOTE(review): p1 is rebound to a fresh TextState on the next line, so this set_x never reaches the object used below (same pattern for p2..p4)
155
+ (p1 = Rpdf2txt::TextState.new).set_y 10000
156
+ (p2 = Rpdf2txt::TextState.new).set_x -5000
157
+ (p2 = Rpdf2txt::TextState.new).set_y 10000
158
+ (p3 = Rpdf2txt::TextState.new).set_x 5000
159
+ (p3 = Rpdf2txt::TextState.new).set_y 10000
160
+ (p4 = Rpdf2txt::TextState.new).set_x 10000
161
+ (p4 = Rpdf2txt::TextState.new).set_y 10000 # as a result all four states only have y set
162
+ p1.set_font_size(10)
163
+ p2.set_font_size(10)
164
+ p3.set_font_size(10)
165
+ p4.set_font_size(10)
166
+ p1.update!
167
+ p2.update!
168
+ p3.update!
169
+ p4.update!
170
+ assert_equal(true, p1.same_word(p1))
171
+ assert_equal(true, p1.same_word(p2))
172
+ assert_equal(true, p1.same_word(p3))
173
+ assert_equal(true, p1.same_word(p4))
174
+ assert_equal(true, p2.same_word(p2))
175
+ assert_equal(true, p2.same_word(p3))
176
+ assert_equal(true, p2.same_word(p4))
177
+ assert_equal(true, p3.same_word(p3))
178
+ assert_equal(true, p3.same_word(p4))
179
+ assert_equal(true, p4.same_word(p4))
180
+ p1.set_y -10 # move p1 to a different y than the others
181
+ p1.update!
182
+ assert_equal(true, p1.same_word(p1))
183
+ assert_equal(false, p1.same_word(p2))
184
+ assert_equal(false, p1.same_word(p3))
185
+ assert_equal(false, p1.same_word(p4))
186
+ end
187
+ def test_same_line # Expects same_line to be symmetric, true when the vertical extents y2..y of two states overlap, and false when they only touch; the sketches in the comments illustrate each case.
188
+ ts1 = Rpdf2txt::TextState.new
189
+ ts1.set_y(210)
190
+ ts1.set_font_size(10)
191
+ ts1.update!
192
+ assert_equal(210.000, ts1.y)
193
+ assert_equal(200.000, ts1.y2) # y2 is expected to be y minus the font size
194
+
195
+ ts2 = Rpdf2txt::TextState.new
196
+ ts2.set_y(200)
197
+ ts2.set_font_size(10)
198
+ ts2.update!
199
+ assert_equal(200.000, ts2.y)
200
+ assert_equal(190.000, ts2.y2)
201
+
202
+ # -----
203
+ # ts1
204
+ # ----- ----- => not same line
205
+ # ts2
206
+ # -----
207
+ assert_equal(false, ts1.same_line(ts2)) # ts1.y2 == ts2.y: the extents touch but do not overlap
208
+ assert_equal(false, ts2.same_line(ts1))
209
+
210
+ ts3 = Rpdf2txt::TextState.new
211
+ ts3.set_y(205)
212
+ ts3.set_font_size(10)
213
+ ts3.update!
214
+ assert_equal(205.000, ts3.y)
215
+ assert_equal(195.000, ts3.y2)
216
+
217
+ # -----
218
+ # ts1 -----
219
+ # ----- ts3 => same line
220
+ # -----
221
+ assert_equal(true, ts1.same_line(ts3))
222
+ assert_equal(true, ts3.same_line(ts1))
223
+
224
+ # -----
225
+ # ----- ts3 => same line
226
+ # ts2 -----
227
+ # -----
228
+ assert_equal(true, ts2.same_line(ts3))
229
+ assert_equal(true, ts3.same_line(ts2))
230
+
231
+ ts4 = Rpdf2txt::TextState.new
232
+ ts4.set_y(210)
233
+ ts4.set_font_size(30) # larger font: the extent 180..210 fully contains ts1's 200..210
234
+ ts4.update!
235
+ assert_equal(210.000, ts4.y)
236
+ assert_equal(180.000, ts4.y2)
237
+
238
+ # -----
239
+ # -----
240
+ # ts1 ts4 => same line
241
+ # -----
242
+ # -----
243
+ assert_equal(true, ts1.same_line(ts4))
244
+ assert_equal(true, ts4.same_line(ts1))
245
+ end
246
+ def test_set_txt # After set_txt the width w must be about 5.612, which equals the sum of the setup font's /Widths entries for "Hello World" divided by 1000.
247
+ @text_state.set_txt("Hello World")
248
+ assert_in_delta(5.612, @text_state.w, 0.001) # w is readable through the accessor added at the top of this file
249
+ end
250
+ def test_set_char_spacing # set_char_spacing is given the operand together with the "Tc" operator; char_spacing must then report the value multiplied by 1000.
251
+ assert_equal(0, @text_state.char_spacing) # initial value before any Tc operand is applied
252
+ @text_state.set_char_spacing('-0.456 Tc')
253
+ assert_equal(-456, @text_state.char_spacing)
254
+ @text_state.set_char_spacing('0.789 Tc')
255
+ assert_equal(789, @text_state.char_spacing)
256
+ end
257
+ def test_same_word2 # Variant of test_same_word in which every state also gets the setup font and a three-letter text before update!.
258
+ (p1 = Rpdf2txt::TextState.new).set_x -10000 # NOTE(review): p1 is rebound to a fresh TextState on the next line, so this set_x never reaches the object used below (same pattern for p2..p4)
259
+ (p1 = Rpdf2txt::TextState.new).set_y 10000
260
+ (p2 = Rpdf2txt::TextState.new).set_x -5000
261
+ (p2 = Rpdf2txt::TextState.new).set_y 10000
262
+ (p3 = Rpdf2txt::TextState.new).set_x 5000
263
+ (p3 = Rpdf2txt::TextState.new).set_y 10000
264
+ (p4 = Rpdf2txt::TextState.new).set_x 10000
265
+ (p4 = Rpdf2txt::TextState.new).set_y 10000
266
+ p1.set_font_size(10.0)
267
+ p2.set_font_size(10.0)
268
+ p3.set_font_size(10.0)
269
+ p4.set_font_size(10.0)
270
+ chars = %w(a b c)
271
+ [p1,p2,p3,p4].each { |text_state|
272
+ text_state.set_font(@font)
273
+ text_state.set_txt(chars.join)
274
+ text_state.update!
275
+ chars = chars.collect { |char| char.next } # String#next shifts each letter, so the states get "abc", "bcd", "cde", "def"
276
+ }
277
+ assert_equal(true, p1.same_word(p1))
278
+ assert_equal(true, p1.same_word(p2))
279
+ assert_equal(true, p1.same_word(p3))
280
+ assert_equal(true, p1.same_word(p4))
281
+ assert_equal(true, p2.same_word(p2))
282
+ assert_equal(true, p2.same_word(p3))
283
+ assert_equal(true, p2.same_word(p4))
284
+ assert_equal(true, p3.same_word(p3))
285
+ assert_equal(true, p3.same_word(p4))
286
+ assert_equal(true, p4.same_word(p4))
287
+ p1.set_y -10 # move p1 to a different y than the others
288
+ p1.update!
289
+ assert_equal(true, p1.same_word(p1))
290
+ assert_equal(false, p1.same_word(p2))
291
+ assert_equal(false, p1.same_word(p3))
292
+ assert_equal(false, p1.same_word(p4))
293
+ end
294
+ def test_char_width # Expected widths: /Widths entry divided by 1000, plus the char spacing for every character, plus the word spacing for the space character only.
295
+ assert_equal(0.556, @text_state.char_width('a'))
296
+ assert_equal(0.278, @text_state.char_width(' '))
297
+ @text_state.set_char_spacing('0.023')
298
+ assert_equal(0.579, @text_state.char_width('a')) # NOTE(review): exact float equality; assert_in_delta (as in test_set_txt) would be less fragile
299
+ assert_equal(0.301, @text_state.char_width(' '))
300
+ @text_state.set_word_spacing('0.012')
301
+ assert_equal(0.579, @text_state.char_width('a')) # unchanged: word spacing is not expected to affect 'a'
302
+ assert_equal(0.313, @text_state.char_width(' '))
303
+ end
304
+ def test_txt # Replaces the font with a mock reporting encoding 'mac' and checks what txt returns for text containing a non-ASCII character. NOTE(review): presumably input is Mac Roman and expected is its latin1 equivalent; the non-ASCII bytes are garbled in this listing, so confirm against the original file.
305
+ font = flexmock('font')
306
+ input = "Anwendung: Bei nerv�sen Herzbeschwerden"
307
+ font.should_receive(:encoding).and_return('mac')
308
+ font.should_receive(:attributes).and_return({})
309
+ font.should_ignore_missing # any other message sent to the mock is accepted
310
+ @text_state.set_font(font)
311
+ @text_state.set_txt(input)
312
+ expected = "Anwendung: Bei nerv�sen Herzbeschwerden"
313
+ assert_equal(expected, @text_state.txt)
314
+ end
315
+ end