rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# TestSpaceBug052004 -- rpdf2txt -- 28.05.2004 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
$: << File.dirname(__FILE__)
|
5
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'rpdf2txt/parser'
|
9
|
+
|
10
|
+
module Rpdf2txt
|
11
|
+
class TestSpaceBug052004 < Test::Unit::TestCase
|
12
|
+
# Stand-in handed to Stream#extract_text_objects in place of a page object.
# It offers font lookup by resource key plus an attributes hash.
class FontDonor
  attr_accessor :fonts, :attributes

  # Starts with an empty attributes hash and an empty font table, so that
  # #font may be called before #fonts= without raising NoMethodError on nil.
  def initialize
    @attributes = {}
    @fonts = {}
  end

  # Returns the font registered under +key+, or nil if there is none.
  def font(key)
    @fonts[key]
  end
end
|
21
|
+
# Regression test for the space bug of May 2004.
# Two fonts sharing the base font Frutiger-Roman but using different
# encodings (MacRoman for f3, WinAnsi for f30) are registered on a FontDonor.
# The text objects of data/space_bug_stream.txt are extracted and sorted, and
# the third-to-last snippet (uml_char) must be reported as belonging to the
# same word as the fourth-to-last snippet (g_char).
def test_same_word
  font3_src = <<-EOS
7 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 32
/LastChar 240
/Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
722 0 0 0 0 0 722 0 556 556 556 0 0 0 556 556 556 0 0 0 278 278
0 0 0 611 611 0 0 611 611 611 0 400 556 556 0 0 0 0 0 800 0 0 0
278 0 0 278 600 278 278 0 611 278 278 278 278 278 0 0 278 0 0 0
0 0 278 0 278 278 556 556 0 278 0 0 0 0 0 500 0 556 0 0 278 0 278
0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 278 ]
/Encoding /MacRomanEncoding
/BaseFont /Frutiger-Roman
/FontDescriptor 381 0 R
>>
endobj
  EOS
  font30_src = <<-EOS
74 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 32
/LastChar 181
/Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
278 278 556 556 278 278 278 278 278 800 278 278 278 278 278 278
278 600 278 278 278 611 ]
/Encoding /WinAnsiEncoding
/BaseFont /Frutiger-Roman
/FontDescriptor 381 0 R
>>
endobj
  EOS
  font3 = Font.new(font3_src) # MacRoman Encoded, used for umlauts
  font30 = Font.new(font30_src) # WinAnsi Encoded
  path = File.expand_path('data/space_bug_stream.txt',
                          File.dirname(__FILE__))
  stream = Stream.new(File.read(path))
  page = FontDonor.new
  page.fonts = {
    :f3  => font3,
    :f30 => font30,
  }
  # The former `snippets = []` pre-assignment was dead code: the value was
  # overwritten by the extraction result before ever being read.
  snippets = stream.extract_text_objects(page, TextState.new)
  sorted = snippets.sort
  g_char = sorted[-4]
  uml_char = sorted[-3]
  assert_equal(true, uml_char.same_word(g_char))
end
|
86
|
+
end
|
87
|
+
end
|
data/test/test_stream.rb
ADDED
@@ -0,0 +1,194 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# TestStream -- rpdf2txt -- 01.06.2005 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
5
|
+
|
6
|
+
require 'test/unit'
|
7
|
+
require 'rpdf2txt/object'
|
8
|
+
|
9
|
+
module Rpdf2txt
|
10
|
+
# Test-only reopening of Stream: adds writers for the decoded and raw stream
# contents so that tests can inject them directly.
class Stream < PdfObject
  attr_writer :decoded_stream
  attr_writer :raw_stream
end
|
13
|
+
class TestStream < Test::Unit::TestCase
|
14
|
+
# Creates the shared fixture: a Stream parsed from a one-line PDF object
# (object 3, /Length 12) whose stream section holds the text (Hello World!).
def setup
  @stream = Rpdf2txt::Stream.new(
    "3 0 obj << /Length 12 >> stream\n (Hello World!) endstream endobj"
  )
end
|
18
|
+
# Feeds a content stream in which the whole BT ... ET text object sits on a
# single line and expects four extracted snippets, the last being a TextState.
# NOTE(review): the character after "Ponstan" appears as U+FFFD in this copy
# of the file; the original byte was presumably lost in transcoding - restore
# it from the upstream source if available.
def test_oneliner
  content = <<-EOS
1 0 0 1 -0.5 -0.5 cm q 455.043 -1.5 m 455.043 12.1 l 452.043 9.1 l 452.043 1.5 l
W* n 453.543 -0.5 m 453.543 11.1 l S Q q 455.043 12.1 m -1.5 12.1 l 1.5 9.1 l 452.043
9.1 l W* n 454.043 10.6 m -0.5 10.6 l S Q q -1.5 12.1 m -1.5 -1.5 l 1.5 1.5 l 1.5
9.1 l W* n 0 11.1 m 0 -0.5 l S Q 1 0 0 1 0 39.866 cm
BT /F1 16 Tf 1 0 0 -1 0 14.347 Tm(Ponstan�) Tj 0 0 0 RG ET
  EOS
  stream = Stream.new('')
  # Same effect as setting @decoded_stream directly, via the test-only writer.
  stream.decoded_stream = content
  extracted = stream.extract_text_objects(nil, TextState.new)
  assert_equal(4, extracted.size, "no text snippets were found")
  assert_instance_of(TextState, extracted.last)
end
|
32
|
+
# Wraps the compressed fixture data/firststream in stream/endstream keywords,
# marks the stream as /FlateDecode and expects decode_raw_stream to return
# the contents of data/test.txt.
def test_decode_raw_stream
  deflated_path = File.expand_path('./data/firststream',
                                   File.dirname(__FILE__))
  # The fixture is decoded with /FlateDecode, i.e. it is compressed binary
  # data. Read it in binary mode so that no newline translation or text
  # encoding is applied to the bytes.
  deflated = File.open(deflated_path, 'rb') { |io| io.read }
  stream = Rpdf2txt::Stream.new("stream\n#{deflated}endstream")
  stream.attributes.store(:filter, '/FlateDecode')
  expected_path = File.expand_path('./data/test.txt',
                                   File.dirname(__FILE__))
  expected = File.read(expected_path)
  assert_equal(expected, stream.decode_raw_stream)
end
|
44
|
+
# The raw stream of the fixture is the text found between the stream and
# endstream keywords.
def test_raw_stream
  assert_equal(' (Hello World!) ', @stream.raw_stream)
end
|
48
|
+
# The fixture declares no filter, so the decoded stream is expected to be the
# same text as the raw stream.
def test_decoded_stream
  assert_equal(' (Hello World!) ', @stream.decoded_stream)
end
|
52
|
+
# Injects the compressed fixture data/firststream as raw stream, marks it as
# /FlateDecode and expects decoded_stream to equal data/test.txt.
def test_decoded_stream2
  deflated_path = File.expand_path('./data/firststream', File.dirname(__FILE__))
  # Compressed binary data: read in binary mode so the bytes reach the
  # decoder without newline translation or text-encoding interpretation.
  @stream.raw_stream = File.open(deflated_path, 'rb') { |io| io.read }
  @stream.attributes[:filter] = "/FlateDecode"
  expected_path = File.expand_path('./data/test.txt', File.dirname(__FILE__))
  expected = File.read(expected_path)
  assert_equal(expected, @stream.decoded_stream)
end
|
60
|
+
# Extracts text objects from data/stream.txt and expects exactly 69 of the
# returned items to be TextState instances.
def test_extract_text_objects
  fixture = File.expand_path('./data/stream.txt', File.dirname(__FILE__))
  @stream.decoded_stream = File.read(fixture)
  extracted = @stream.extract_text_objects(nil, TextState.new)
  text_states = extracted.select do |candidate|
    candidate.is_a?(TextState)
  end
  assert_instance_of(Array, text_states)
  assert_equal(69, text_states.size)
end
|
69
|
+
# A text object containing marked-content sections (/P ... BDC, /Artifact ...
# BDC, EMC) must be extracted without raising, yielding 22 TextState items.
def test_extract_text_objects__artifact
  @stream.decoded_stream = <<-EOS
BT
/P <</MCID 0 >>BDC
/CS0 cs 1 0 0 scn
/TT2 1 Tf
7.98 0 0 7.98 42.54 718.7603 Tm
( )Tj
/TT3 1 Tf
6.0646 Tc 10.02 0 0 10.02 42.54 21.6803 Tm
( )Tj
22.635 0 Td
( )Tj
22.635 0 Td
( )Tj
0 g
/TT2 1 Tf
0 Tc 0.0004 Tw 7.98 0 0 7.98 516.84 21.6803 Tm
(- Seite 1 / 32 - )Tj
EMC
/Artifact <</Type /Printing >>BDC
/TT0 1 Tf
0 Tw -59.436 5.406 Td
(1)Tj
-0.0007 Tc 0.0049 Tw 7.02 0 0 7.02 46.98 59.3003 Tm
[(Siehe Legende a)8(u)3(f der let)8(z)-5(ten)12( Se)8(it)8(e des V)8(erze)8(i)-5(chnis)8(ses )]TJ
0.0005 Tc 0 Tw -0.632 -1.299 Td
[(29.09)10(.09 )]TJ
/TT1 1 Tf
0 Tc 7.98 0 0 7.98 42.54 41.2403 Tm
( )Tj
/TT2 1 Tf
9 0 0 9 42.54 31.0403 Tm
( )Tj
EMC
ET
  EOS
  text_states = nil
  assert_nothing_raised do
    extracted = @stream.extract_text_objects(nil, TextState.new)
    text_states = extracted.select do |candidate|
      candidate.is_a?(TextState)
    end
  end
  assert_instance_of(Array, text_states)
  assert_equal(22, text_states.size)
end
|
115
|
+
# Runs graphics operators (q, Q, re, W, n, cs, sc, cm) through
# extract_nontext_objects, starting from the identity matrix, and expects the
# returned matrix to carry the six operands of the final cm operator.
def test_extract_nontext_objects
  operators = <<-EOS
q Q q 12 12.240058 587.76001 767.76001 re W n /Cs1 cs 0 0 0 sc q 0.23999999
0 0 0.23999999 64.800003 38.880058 cm
  EOS
  identity = Matrix[[1,0,0], [0,1,0], [0,0,1]]
  matrix_stack = [identity]
  expected = Matrix[
    [0.23999999, 0, 0],
    [0, 0.23999999, 0],
    [64.800003, 38.880058, 1]
  ]
  tmatrix = @stream.extract_nontext_objects(operators, identity, matrix_stack, [])
  assert_equal(expected, tmatrix)
  #assert_equal([0.23999999, 0, 0, 0.23999999, 64.800003, 38.880058], tmatrix)
end
|
129
|
+
# The sample starts a text object with BT but never closes it with an ET
# operator; the letters "ET" occur only inside a string operand
# ("... ANGUSTIFOLIA ET \(AUT\) ..."). Stream::ET_PATTERN must not match.
# NOTE(review): several non-ASCII characters in the sample appear as U+FFFD in
# this copy of the file; presumably umlauts lost in transcoding - restore them
# from the upstream source if available.
def test_robust_et
  unterminated = <<-'EOS'
BT
10 0 0 10 113.0394 341.8156 Tm
-0.0002 Tc
0.0000 Tw
[(Zul.-Nr)91.6(.: )]TJ
/F10 1 Tf
3.8771 0 TD
(55994)Tj
/F3 1 Tf
8.8787 0 TD
-0.0001 Tc
[(V)36.8(erkaufskategorie: )]TJ
/F10 1 Tf
9.1015 0 TD
0.0000 Tc
(B)Tj
/F3 1 Tf
3.6544 0 TD
-0.0001 Tc
-0.0306 Tw
[(Index: 20.01.0.)-9563.5(20.03.2003)]TJ
-25.5118 -2.2428 TD
0.0000 Tw
[(Zusammensetzung:)-921.1(01)]TJ
8.3 0 0 8.3 226.4252 319.3875 Tm
-0.0603 Tw
[(BARII CARBONAS D12 8.8)-139.1(%, CALCII IODIDUM D4 14.5)-139.1(%, CISTUS CANADE)]TJ
33.6765 0 TD
[(NSIS D4 16.5)-139.1(%, CO-)]TJ
-33.6765 -1.3661 TD
-0.0203 Tw
[(NIUM MACULA)73.7(TUM D8 14.7)-139.1(%, ECHINACEA ANGUSTIFOLIA ET \(AUT\) P)36.7(ALLID)]TJ
35.2499 0 TD
(A spag. Peka D5)Tj
-35.2499 -1.3661 TD
-0.1016 Tw
[(16.5)-139.1(%, JUGLANS REGIA spag. Peka D4 14.5)-139.1(%, SCROPHULARIA NODOSA D)]TJ
33.6965 0 TD
[(4 14.5)-139.1(%, EXCIPIENS)]TJ
-33.6965 -1.3661 TD
-0.0306 Tw
(ad GLOBULOS.)Tj
10 0 0 10 113.0394 271.1969 Tm
[(Anwendung:)-5285.2(Bei L)36.8(ymphdr�senentz�ndungen im Hals-Rachenraum)]TJ
0 -1.4174 TD
-0.0002 Tc
0.0000 Tw
[(Packung:)-5699.5(01)-305.7(001)-7141.9(10)-567.1(g)-18097.9(B)]TJ
T*
-0.0001 Tc
-0.0306 Tw
[(G�ltig bis:)-6647.8(19. M�rz 2008)]TJ
  EOS
  et_match = Stream::ET_PATTERN.match(unterminated)
  assert_nil(et_match)
end
|
186
|
+
# A text object whose operators are separated by carriage returns (\r) only
# must still be recognised: Stream::ET_PATTERN has to match the closing ET.
def test_et__carriage_return
  cr_separated = <<-EOS
BT\r/F2 1 Tf\r6 0 0 6 476.805 591.0707 Tm\r/Cs5 cs 0 0 0 sc\r/GS1 gs\r0.2778 Tc\r0 Tw\r[(03.)-358.6(Jahrgang)]TJ\r0 -1.1667 TD\r(03)Tj\r3 0 0 3 486.8113 586.1957 Tm\r0 Tc\r(e)Tj\r6 0 0 6 492.2965 584.0707 Tm\r0.2779 Tc\r(ann\216e)Tj\r5 0 0 5 476.805 570.5707 Tm\r0.1111 Tc\r-0.0306 Tw\r(ISSN 0026-9212)Tj\r/F4 1 Tf\r14 0 0 14 476.5422 597.3198 Tm\r0 Tw\r(5/2004)Tj\rET\r
  EOS
  et_match = Stream::ET_PATTERN.match(cr_separated)
  assert_not_nil(et_match)
end
|
192
|
+
|
193
|
+
end
|
194
|
+
end
|
@@ -0,0 +1,315 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# TestTextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
26
|
+
$: << File.dirname(__FILE__)
|
27
|
+
|
28
|
+
require 'test/unit'
|
29
|
+
require 'rpdf2txt/text_state'
|
30
|
+
require 'rpdf2txt/object'
|
31
|
+
require 'flexmock'
|
32
|
+
|
33
|
+
# Test-only reopening of Rpdf2txt::TextState: exposes w and char_spacing
# through accessors so that the assertions can read them.
module Rpdf2txt
  class TextState
    attr_accessor :w
    attr_accessor :char_spacing
  end
end
|
38
|
+
class TestTextState < Test::Unit::TestCase
|
39
|
+
include FlexMock::TestCase
|
40
|
+
# Builds the shared fixtures: a TextState targeting 'latin1' and a
# MacRoman-encoded Frutiger-BoldItalic Type1 font (chars 32..240), which is
# then selected on the text state.
def setup
  font_definition = <<-EOS
580 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 32
/LastChar 240
/Widths [ 278 389 500 556 556 1000 722 278 333 333 556 600 278 389 278 278
556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
556 778 611 556 556 722 667 1000 667 667 556 389 278 389 600 500
278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
611 611 389 444 389 611 556 889 556 556 500 333 222 333 600 278
0 0 0 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 0 0 611
0 0 0 0 556 556 0 0 0 0 0 800 0 0 0 278 0 0 278 600 278 278 0 611
278 278 278 278 278 0 0 278 0 0 0 0 0 278 0 278 278 0 0 0 278 0
0 0 0 0 0 0 0 0 0 0 0 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 278 ]
/Encoding /MacRomanEncoding
/BaseFont /Frutiger-BoldItalic
/FontDescriptor 579 0 R
>>
endobj
  EOS
  @text_state = Rpdf2txt::TextState.new('latin1')
  @font = Rpdf2txt::Font.new(font_definition)
  @text_state.set_font(@font)
end
|
69
|
+
# Coordinates given as strings to set_x / set_y are readable as numbers from
# x / y after update! (compared at four decimal places).
def test_set
  @text_state.set_x("42.7953")
  @text_state.set_y("670.6528")
  @text_state.update!
  x_scaled = (@text_state.x*10000).round
  assert_equal(427953, x_scaled)
  y_scaled = (@text_state.y*10000).round
  assert_equal(6706528, y_scaled)
end
|
76
|
+
# update_x / update_y move the position relative to the current one: applying
# an offset and then its negation must lead back to the starting coordinates.
def test_update_td
  @text_state.set_x("100.1234")
  @text_state.set_y("200.5678")
  moves = [
    ["1.2345",  "6.7890",  1013579, 2073568],
    ["-1.2345", "-6.7890", 1001234, 2005678],
  ]
  moves.each do |delta_x, delta_y, want_x, want_y|
    @text_state.update_x(delta_x)
    @text_state.update_y(delta_y)
    @text_state.update!
    assert_equal(want_x, (@text_state.x*10000).to_i)
    assert_equal(want_y, (@text_state.y*10000).to_i)
  end
end
|
90
|
+
# Like test_update_td, but additionally sets the leading before each move and
# checks that it is reported unchanged by lead.
def test_update_tD
  scaled = lambda { |coordinate| (coordinate*10000).to_i }

  @text_state.set_x("100.1234")
  @text_state.set_y("200.5678")
  @text_state.set_lead("-6.7890")
  @text_state.update_x("1.2345")
  @text_state.update_y("6.7890")
  @text_state.update!
  assert_equal(-6789.0, @text_state.lead*1000)
  assert_equal(1013579, scaled.call(@text_state.x))
  assert_equal(2073568, scaled.call(@text_state.y))

  # NOTE(review): the first of these two set_lead calls is immediately
  # superseded by the second; kept as in the original - confirm whether it
  # was meant to be something else.
  @text_state.set_lead(-1.2345)
  @text_state.set_lead(6.7890)
  @text_state.update_x("-1.2345")
  @text_state.update_y("-6.7890")
  @text_state.update!
  assert_equal(6789.0, @text_state.lead*1000)
  assert_equal(1001234, scaled.call(@text_state.x))
  assert_equal(2005678, scaled.call(@text_state.y))
end
|
109
|
+
# set_lead accepts the leading as a string; lead returns it as a number, for
# positive and negative values alike.
def test_set_lead_tl
  [["1.2345", 1234.5], ["-6.7890", -6789.0]].each do |operand, want_milli|
    @text_state.set_lead(operand)
    assert_equal(want_milli, @text_state.lead*1000)
  end
end
|
115
|
+
# After text has been set and advance_x applied, x differs from its start
# value while y is unchanged. step then moves y by the leading
# (200.1234 - 3.4567 = 196.6667) and puts x back to 400.5678.
def test_step
  scaled = lambda { |coordinate| (coordinate * 10000).to_i }

  @text_state.set_lead(-3.4567)
  @text_state.set_y(200.1234)
  @text_state.set_x(400.5678)
  @text_state.set_txt('foo')
  @text_state.advance_x
  @text_state.update!
  assert_equal(2001234, scaled.call(@text_state.y))
  assert_not_equal(4005678, scaled.call(@text_state.x))

  @text_state.step
  @text_state.update!
  assert_equal(1966667, scaled.call(@text_state.y))
  assert_equal(4005678, scaled.call(@text_state.x))
end
|
129
|
+
# Ordering across different y values: a state at y 250.3456 must compare
# greater than one at y 200.5678 even though its x (88.9012) is smaller than
# the other's (100.1234). Both use font size 0.0.
def test_compare1
  @text_state.set_font_size(0.0)
  @text_state.set_x 100.1234
  @text_state.set_y 200.5678
  @text_state.update!
  other = Rpdf2txt::TextState.new
  other.set_font_size(0.0)
  other.set_x 88.9012
  other.set_y 250.3456
  other.update!
  # The failure message has to be a String: the original passed the Integer
  # result of <=>, which test-unit 2.x and later reject with ArgumentError.
  assert(other > @text_state,
         "expected other > @text_state, <=> returned #{(other <=> @text_state).inspect}")
end
|
141
|
+
# Ordering within one y value: with equal y (200.5678) and font size 10, the
# state with the smaller x (88.9012) must compare less than the one at
# x 100.1234.
def test_compare2
  @text_state.set_font_size(10)
  @text_state.set_x 100.1234
  @text_state.set_y 200.5678
  @text_state.update!
  other = Rpdf2txt::TextState.new
  other.set_font_size(10)
  other.set_x 88.9012
  other.set_y 200.5678
  other.update!
  # The failure message has to be a String: the original passed the Integer
  # result of <=>, which test-unit 2.x and later reject with ArgumentError.
  assert(other < @text_state,
         "expected other < @text_state, <=> returned #{(other <=> @text_state).inspect}")
end
|
153
|
+
# Four states on the same y (10000) with font size 10 must all count as the
# same word, pairwise. After moving p1 to y -10 it must only match itself.
def test_same_word
  # NOTE(review): as in the original, set_x is applied to a throw-away
  # instance and a second, fresh instance receives set_y - so the states under
  # test never get an x coordinate. Behaviour is preserved here; confirm
  # whether x was meant to be set on the same objects.
  p1, p2, p3, p4 = [-10000, -5000, 5000, 10000].collect do |discarded_x|
    Rpdf2txt::TextState.new.set_x(discarded_x)
    state = Rpdf2txt::TextState.new
    state.set_y(10000)
    state
  end
  states = [p1, p2, p3, p4]
  states.each { |state| state.set_font_size(10) }
  states.each { |state| state.update! }

  same_word_pairs = [
    [p1, p1], [p1, p2], [p1, p3], [p1, p4],
    [p2, p2], [p2, p3], [p2, p4],
    [p3, p3], [p3, p4],
    [p4, p4],
  ]
  same_word_pairs.each do |left, right|
    assert_equal(true, left.same_word(right))
  end

  p1.set_y(-10)
  p1.update!
  assert_equal(true, p1.same_word(p1))
  [p2, p3, p4].each do |other|
    assert_equal(false, p1.same_word(other))
  end
end
|
187
|
+
# same_line is checked in both directions for vertical extents y..y2, where
# the assertions establish y2 = y - font size for each state built here.
def test_same_line
  # Builds a state at +top+ with +font_size+, verifies its y / y2 and returns it.
  build = lambda { |top, font_size, want_y, want_y2|
    state = Rpdf2txt::TextState.new
    state.set_y(top)
    state.set_font_size(font_size)
    state.update!
    assert_equal(want_y, state.y)
    assert_equal(want_y2, state.y2)
    state
  }

  ts1 = build.call(210, 10, 210.000, 200.000)
  ts2 = build.call(200, 10, 200.000, 190.000)

  # ts1 covers 210..200, ts2 covers 200..190: they merely touch
  # => not the same line
  assert_equal(false, ts1.same_line(ts2))
  assert_equal(false, ts2.same_line(ts1))

  ts3 = build.call(205, 10, 205.000, 195.000)

  # ts3 covers 205..195 and overlaps the lower half of ts1 => same line
  assert_equal(true, ts1.same_line(ts3))
  assert_equal(true, ts3.same_line(ts1))

  # ts3 also overlaps the upper half of ts2 => same line
  assert_equal(true, ts2.same_line(ts3))
  assert_equal(true, ts3.same_line(ts2))

  ts4 = build.call(210, 30, 210.000, 180.000)

  # ts4 covers 210..180 and contains ts1 entirely => same line
  assert_equal(true, ts1.same_line(ts4))
  assert_equal(true, ts4.same_line(ts1))
end
|
246
|
+
# Setting a text updates the width w. The expected 5.612 equals the sum of
# the fixture font's /Widths entries for "Hello World" (5612) divided by 1000.
def test_set_txt
  @text_state.set_txt("Hello World")
  measured = @text_state.w
  assert_in_delta(5.612, measured, 0.001)
end
|
250
|
+
# char_spacing starts at 0; set_char_spacing takes the operand text of a Tc
# operator and the value is then reported in thousandths.
def test_set_char_spacing
  assert_equal(0, @text_state.char_spacing)
  [['-0.456 Tc', -456], ['0.789 Tc', 789]].each do |operand, want|
    @text_state.set_char_spacing(operand)
    assert_equal(want, @text_state.char_spacing)
  end
end
|
257
|
+
# Variant of test_same_word in which every state also has the fixture font
# and a three-letter text ("abc", "bcd", "cde", "def") before comparison.
def test_same_word2
  # NOTE(review): as in the original, set_x is applied to a throw-away
  # instance and a second, fresh instance receives set_y - so the states under
  # test never get an x coordinate. Behaviour is preserved here; confirm
  # whether x was meant to be set on the same objects.
  p1, p2, p3, p4 = [-10000, -5000, 5000, 10000].collect do |discarded_x|
    Rpdf2txt::TextState.new.set_x(discarded_x)
    state = Rpdf2txt::TextState.new
    state.set_y(10000)
    state
  end
  states = [p1, p2, p3, p4]
  states.each { |state| state.set_font_size(10.0) }

  letters = %w(a b c)
  states.each do |state|
    state.set_font(@font)
    state.set_txt(letters.join)
    state.update!
    # shift every letter by one for the next state: abc -> bcd -> cde -> def
    letters = letters.collect { |letter| letter.next }
  end

  same_word_pairs = [
    [p1, p1], [p1, p2], [p1, p3], [p1, p4],
    [p2, p2], [p2, p3], [p2, p4],
    [p3, p3], [p3, p4],
    [p4, p4],
  ]
  same_word_pairs.each do |left, right|
    assert_equal(true, left.same_word(right))
  end

  p1.set_y(-10)
  p1.update!
  assert_equal(true, p1.same_word(p1))
  [p2, p3, p4].each do |other|
    assert_equal(false, p1.same_word(other))
  end
end
|
294
|
+
# char_width for 'a' and ' ' with the fixture font: plain widths first, then
# with character spacing 0.023 added to both, then with word spacing 0.012
# added on top for the space only.
def test_char_width
  check_widths = lambda { |want_a, want_space|
    assert_equal(want_a, @text_state.char_width('a'))
    assert_equal(want_space, @text_state.char_width(' '))
  }

  check_widths.call(0.556, 0.278)
  @text_state.set_char_spacing('0.023')
  check_widths.call(0.579, 0.301)
  @text_state.set_word_spacing('0.012')
  check_widths.call(0.579, 0.313)
end
|
304
|
+
# Text set on a state whose font reports the 'mac' encoding is expected back
# from txt in the state's target encoding ('latin1', see setup).
#
# NOTE(review): in this copy of the file the o-umlaut of "nervoesen" had been
# replaced by U+FFFD in both strings, which made input and expected identical.
# The bytes below are reconstructed on the assumption that the input carried
# the Mac OS Roman byte (0x9A, octal 232) and the expectation the ISO-8859-1
# byte (0xF6, octal 366) - confirm against the upstream source.
def test_txt
  font = flexmock('font')
  input = "Anwendung: Bei nerv\232sen Herzbeschwerden"
  font.should_receive(:encoding).and_return('mac')
  font.should_receive(:attributes).and_return({})
  font.should_ignore_missing
  @text_state.set_font(font)
  @text_state.set_txt(input)
  expected = "Anwendung: Bei nerv\366sen Herzbeschwerden"
  assert_equal(expected, @text_state.txt)
end
|
315
|
+
end
|