rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# TestSpaceBug052004 -- rpdf2txt -- 28.05.2004 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
$: << File.dirname(__FILE__)
|
5
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'rpdf2txt/parser'
|
9
|
+
|
10
|
+
module Rpdf2txt
|
11
|
+
class TestSpaceBug052004 < Test::Unit::TestCase
|
12
|
+
# Minimal stand-in for a page object: it only carries a font table and an
# attribute hash, and answers #font lookups by key.
class FontDonor
  attr_accessor :fonts, :attributes

  # Starts with an empty attribute hash and an empty font table, so that
  # #font can be called safely before #fonts= has been used.
  def initialize
    @attributes = {}
    @fonts = {}
  end

  # Returns the font registered under +key+, or nil when there is none.
  def font(key)
    @fonts[key]
  end
end
|
21
|
+
# Extracts the text snippets of data/space_bug_stream.txt using two
# Frutiger-Roman fonts (one MacRoman encoded, one WinAnsi encoded) and
# checks that two neighbouring snippets of the sorted result are reported
# as belonging to the same word.
def test_same_word
  mac_roman_src = <<-EOS
7 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 32
/LastChar 240
/Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
722 0 0 0 0 0 722 0 556 556 556 0 0 0 556 556 556 0 0 0 278 278
0 0 0 611 611 0 0 611 611 611 0 400 556 556 0 0 0 0 0 800 0 0 0
278 0 0 278 600 278 278 0 611 278 278 278 278 278 0 0 278 0 0 0
0 0 278 0 278 278 556 556 0 278 0 0 0 0 0 500 0 556 0 0 278 0 278
0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 278 ]
/Encoding /MacRomanEncoding
/BaseFont /Frutiger-Roman
/FontDescriptor 381 0 R
>>
endobj
  EOS
  win_ansi_src = <<-EOS
74 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 32
/LastChar 181
/Widths [ 278 389 556 556 556 1000 722 278 333 333 556 600 278 333 278 278
556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
556 778 611 500 556 722 667 1000 667 667 556 333 278 333 600 500
278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
611 611 389 389 389 611 500 833 500 500 500 333 222 333 600 278
278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
278 278 556 556 278 278 278 278 278 800 278 278 278 278 278 278
278 600 278 278 278 611 ]
/Encoding /WinAnsiEncoding
/BaseFont /Frutiger-Roman
/FontDescriptor 381 0 R
>>
endobj
  EOS
  font3 = Font.new(mac_roman_src) # MacRoman Encoded, used for umlauts
  font30 = Font.new(win_ansi_src) # WinAnsi Encoded
  path = File.expand_path('data/space_bug_stream.txt',
                          File.dirname(__FILE__))
  stream = Stream.new(File.read(path))
  page = FontDonor.new
  page.fonts = {
    :f3  => font3,
    :f30 => font30,
  }
  # The result is assigned exactly once; the former empty-array
  # placeholder was overwritten immediately and has been removed.
  snippets = stream.extract_text_objects(page, TextState.new)
  sorted = snippets.sort
  # Positions within the sorted fixture output; presumably a "g" and the
  # umlaut that follows it -- verify against the fixture if it changes.
  g_char = sorted[-4]
  uml_char = sorted[-3]
  assert_equal(true, uml_char.same_word(g_char))
end
|
86
|
+
end
|
87
|
+
end
|
data/test/test_stream.rb
ADDED
@@ -0,0 +1,194 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# TestStream -- rpdf2txt -- 01.06.2005 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
5
|
+
|
6
|
+
require 'test/unit'
|
7
|
+
require 'rpdf2txt/object'
|
8
|
+
|
9
|
+
module Rpdf2txt
|
10
|
+
# Test-only reopening of Stream: adds writers so the tests can inject raw
# and decoded stream contents directly.
class Stream < PdfObject
  attr_writer :decoded_stream
  attr_writer :raw_stream
end
|
13
|
+
class TestStream < Test::Unit::TestCase
|
14
|
+
# Builds the shared fixture: a stream object wrapping "(Hello World!)".
def setup
  @stream = Rpdf2txt::Stream.new(
    "3 0 obj << /Length 12 >> stream\n (Hello World!) endstream endobj"
  )
end
|
18
|
+
# Content in which graphics operators and a BT ... ET text object share
# lines: extraction must yield four snippets, the last one a TextState.
def test_oneliner
  content = <<-EOS
1 0 0 1 -0.5 -0.5 cm q 455.043 -1.5 m 455.043 12.1 l 452.043 9.1 l 452.043 1.5 l
W* n 453.543 -0.5 m 453.543 11.1 l S Q q 455.043 12.1 m -1.5 12.1 l 1.5 9.1 l 452.043
9.1 l W* n 454.043 10.6 m -0.5 10.6 l S Q q -1.5 12.1 m -1.5 -1.5 l 1.5 1.5 l 1.5
9.1 l W* n 0 11.1 m 0 -0.5 l S Q 1 0 0 1 0 39.866 cm
BT /F1 16 Tf 1 0 0 -1 0 14.347 Tm(Ponstan�) Tj 0 0 0 RG ET
  EOS
  empty_stream = Stream.new('')
  # bypass parsing and inject the decoded content directly
  empty_stream.instance_variable_set('@decoded_stream', content)
  found = empty_stream.extract_text_objects(nil, TextState.new)
  assert_equal(4, found.size, "no text snippets were found")
  assert_instance_of(TextState, found.last)
end
|
32
|
+
# Wraps the deflated fixture data/firststream in stream/endstream markers,
# flags it as /FlateDecode and expects the contents of data/test.txt back.
def test_decode_raw_stream
  here = File.dirname(__FILE__)
  deflated = File.read(File.expand_path('./data/firststream', here))
  stream = Rpdf2txt::Stream.new("stream\n#{deflated}endstream")
  stream.attributes.store(:filter, '/FlateDecode')
  expected = File.read(File.expand_path('./data/test.txt', here))
  assert_equal(expected, stream.decode_raw_stream)
end
|
44
|
+
# The raw stream is the text between the stream and endstream keywords of
# the fixture built in setup.
def test_raw_stream
  assert_equal(' (Hello World!) ', @stream.raw_stream)
end
|
48
|
+
# The setup fixture declares no filter; its decoded stream is expected to
# equal the raw text.
def test_decoded_stream
  assert_equal(' (Hello World!) ', @stream.decoded_stream)
end
|
52
|
+
# Injects the deflated fixture as raw stream, sets the /FlateDecode filter
# and expects decoded_stream to return the contents of data/test.txt.
def test_decoded_stream2
  here = File.dirname(__FILE__)
  @stream.raw_stream = File.read(File.expand_path('./data/firststream', here))
  @stream.attributes[:filter] = "/FlateDecode"
  expected = File.read(File.expand_path('./data/test.txt', here))
  assert_equal(expected, @stream.decoded_stream)
end
|
60
|
+
# Extracts from data/stream.txt and counts the TextState entries of the
# result; the fixture is expected to yield 69 of them.
def test_extract_text_objects
  fixture = File.expand_path('./data/stream.txt', File.dirname(__FILE__))
  @stream.decoded_stream = File.read(fixture)
  extracted = @stream.extract_text_objects(nil, TextState.new)
  text_states = extracted.select { |item| item.is_a?(TextState) }
  assert_instance_of(Array, text_states)
  assert_equal(69, text_states.size)
end
|
69
|
+
# A text object containing marked-content sections (/P ... BDC and
# /Artifact ... BDC, each closed by EMC) must be extracted without raising
# and is expected to yield 22 TextState entries.
def test_extract_text_objects__artifact
  @stream.decoded_stream = <<-EOS
BT
/P <</MCID 0 >>BDC
/CS0 cs 1 0 0 scn
/TT2 1 Tf
7.98 0 0 7.98 42.54 718.7603 Tm
( )Tj
/TT3 1 Tf
6.0646 Tc 10.02 0 0 10.02 42.54 21.6803 Tm
( )Tj
22.635 0 Td
( )Tj
22.635 0 Td
( )Tj
0 g
/TT2 1 Tf
0 Tc 0.0004 Tw 7.98 0 0 7.98 516.84 21.6803 Tm
(- Seite 1 / 32 - )Tj
EMC
/Artifact <</Type /Printing >>BDC
/TT0 1 Tf
0 Tw -59.436 5.406 Td
(1)Tj
-0.0007 Tc 0.0049 Tw 7.02 0 0 7.02 46.98 59.3003 Tm
[(Siehe Legende a)8(u)3(f der let)8(z)-5(ten)12( Se)8(it)8(e des V)8(erze)8(i)-5(chnis)8(ses )]TJ
0.0005 Tc 0 Tw -0.632 -1.299 Td
[(29.09)10(.09 )]TJ
/TT1 1 Tf
0 Tc 7.98 0 0 7.98 42.54 41.2403 Tm
( )Tj
/TT2 1 Tf
9 0 0 9 42.54 31.0403 Tm
( )Tj
EMC
ET
  EOS
  text_states = nil
  assert_nothing_raised do
    extracted = @stream.extract_text_objects(nil, TextState.new)
    text_states = extracted.select { |item| item.is_a?(TextState) }
  end
  assert_instance_of(Array, text_states)
  assert_equal(22, text_states.size)
end
|
115
|
+
# Feeds graphics operators ending in a "cm" operator, starting from an
# identity matrix, and checks the matrix that is returned.
def test_extract_nontext_objects
  operators = <<-EOS
q Q q 12 12.240058 587.76001 767.76001 re W n /Cs1 cs 0 0 0 sc q 0.23999999
0 0 0.23999999 64.800003 38.880058 cm
  EOS
  identity = Matrix[[1,0,0], [0,1,0], [0,0,1]]
  # the stack starts out holding the very same matrix object
  matrix_stack = [identity]
  expected = Matrix[[0.23999999, 0, 0],
                    [0, 0.23999999, 0],
                    [64.800003, 38.880058, 1]]
  tmatrix = @stream.extract_nontext_objects(operators, identity,
                                            matrix_stack, [])
  assert_equal(expected, tmatrix)
  #assert_equal([0.23999999, 0, 0, 0.23999999, 64.800003, 38.880058], tmatrix)
end
|
129
|
+
# The fixture opens a text object with BT but contains no ET operator; the
# only "ET" in it sits inside a text string ("... ANGUSTIFOLIA ET \(AUT\)
# ..."). ET_PATTERN must not match it.
def test_robust_et
  unterminated = <<-'EOS'
BT
10 0 0 10 113.0394 341.8156 Tm
-0.0002 Tc
0.0000 Tw
[(Zul.-Nr)91.6(.: )]TJ
/F10 1 Tf
3.8771 0 TD
(55994)Tj
/F3 1 Tf
8.8787 0 TD
-0.0001 Tc
[(V)36.8(erkaufskategorie: )]TJ
/F10 1 Tf
9.1015 0 TD
0.0000 Tc
(B)Tj
/F3 1 Tf
3.6544 0 TD
-0.0001 Tc
-0.0306 Tw
[(Index: 20.01.0.)-9563.5(20.03.2003)]TJ
-25.5118 -2.2428 TD
0.0000 Tw
[(Zusammensetzung:)-921.1(01)]TJ
8.3 0 0 8.3 226.4252 319.3875 Tm
-0.0603 Tw
[(BARII CARBONAS D12 8.8)-139.1(%, CALCII IODIDUM D4 14.5)-139.1(%, CISTUS CANADE)]TJ
33.6765 0 TD
[(NSIS D4 16.5)-139.1(%, CO-)]TJ
-33.6765 -1.3661 TD
-0.0203 Tw
[(NIUM MACULA)73.7(TUM D8 14.7)-139.1(%, ECHINACEA ANGUSTIFOLIA ET \(AUT\) P)36.7(ALLID)]TJ
35.2499 0 TD
(A spag. Peka D5)Tj
-35.2499 -1.3661 TD
-0.1016 Tw
[(16.5)-139.1(%, JUGLANS REGIA spag. Peka D4 14.5)-139.1(%, SCROPHULARIA NODOSA D)]TJ
33.6965 0 TD
[(4 14.5)-139.1(%, EXCIPIENS)]TJ
-33.6965 -1.3661 TD
-0.0306 Tw
(ad GLOBULOS.)Tj
10 0 0 10 113.0394 271.1969 Tm
[(Anwendung:)-5285.2(Bei L)36.8(ymphdr�senentz�ndungen im Hals-Rachenraum)]TJ
0 -1.4174 TD
-0.0002 Tc
0.0000 Tw
[(Packung:)-5699.5(01)-305.7(001)-7141.9(10)-567.1(g)-18097.9(B)]TJ
T*
-0.0001 Tc
-0.0306 Tw
[(G�ltig bis:)-6647.8(19. M�rz 2008)]TJ
  EOS
  match = Stream::ET_PATTERN.match(unterminated)
  assert_nil(match)
end
|
186
|
+
# A text object whose operators are separated by carriage returns instead
# of newlines: ET_PATTERN must still find the closing ET.
def test_et__carriage_return
  cr_separated = <<-EOS
BT\r/F2 1 Tf\r6 0 0 6 476.805 591.0707 Tm\r/Cs5 cs 0 0 0 sc\r/GS1 gs\r0.2778 Tc\r0 Tw\r[(03.)-358.6(Jahrgang)]TJ\r0 -1.1667 TD\r(03)Tj\r3 0 0 3 486.8113 586.1957 Tm\r0 Tc\r(e)Tj\r6 0 0 6 492.2965 584.0707 Tm\r0.2779 Tc\r(ann\216e)Tj\r5 0 0 5 476.805 570.5707 Tm\r0.1111 Tc\r-0.0306 Tw\r(ISSN 0026-9212)Tj\r/F4 1 Tf\r14 0 0 14 476.5422 597.3198 Tm\r0 Tw\r(5/2004)Tj\rET\r
  EOS
  match = Stream::ET_PATTERN.match(cr_separated)
  assert_not_nil(match)
end
|
192
|
+
|
193
|
+
end
|
194
|
+
end
|
@@ -0,0 +1,315 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# TestTextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
26
|
+
$: << File.dirname(__FILE__)
|
27
|
+
|
28
|
+
require 'test/unit'
|
29
|
+
require 'rpdf2txt/text_state'
|
30
|
+
require 'rpdf2txt/object'
|
31
|
+
require 'flexmock'
|
32
|
+
|
33
|
+
module Rpdf2txt
|
34
|
+
# Test-only reopening of TextState: exposes w and char_spacing so the
# tests can read (and set) them directly.
class TextState
  attr_accessor :w
  attr_accessor :char_spacing
end
|
37
|
+
end
|
38
|
+
class TestTextState < Test::Unit::TestCase
|
39
|
+
include FlexMock::TestCase
|
40
|
+
# Shared fixture: a 'latin1' TextState using a MacRoman-encoded
# Frutiger-BoldItalic Type1 font built from the object definition below.
def setup
  font_definition = <<-EOS
580 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 32
/LastChar 240
/Widths [ 278 389 500 556 556 1000 722 278 333 333 556 600 278 389 278 278
556 556 556 556 556 556 556 556 556 556 278 278 600 600 600 500
800 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778
556 778 611 556 556 722 667 1000 667 667 556 389 278 389 600 500
278 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611
611 611 389 444 389 611 556 889 556 556 500 333 222 333 600 278
0 0 0 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 0 0 611
0 0 0 0 556 556 0 0 0 0 0 800 0 0 0 278 0 0 278 600 278 278 0 611
278 278 278 278 278 0 0 278 0 0 0 0 0 278 0 278 278 0 0 0 278 0
0 0 0 0 0 0 0 0 0 0 0 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 278 ]
/Encoding /MacRomanEncoding
/BaseFont /Frutiger-BoldItalic
/FontDescriptor 579 0 R
>>
endobj
  EOS
  @text_state = Rpdf2txt::TextState.new('latin1')
  @font = Rpdf2txt::Font.new(font_definition)
  @text_state.set_font(@font)
end
|
69
|
+
# Coordinates given as strings must be readable as numbers after update!.
def test_set
  @text_state.set_x("42.7953")
  @text_state.set_y("670.6528")
  @text_state.update!
  # compare at four decimal places to avoid float noise
  scaled_x = (@text_state.x*10000).round
  scaled_y = (@text_state.y*10000).round
  assert_equal(427953, scaled_x)
  assert_equal(6706528, scaled_y)
end
|
76
|
+
# Moves the position by an offset and then back by the same offset,
# checking x and y (truncated at four decimals) after each move.
def test_update_td
  @text_state.set_x("100.1234")
  @text_state.set_y("200.5678")
  moves = [
    ["1.2345",  "6.7890",  1013579, 2073568],
    ["-1.2345", "-6.7890", 1001234, 2005678],
  ]
  moves.each do |dx, dy, expected_x, expected_y|
    @text_state.update_x(dx)
    @text_state.update_y(dy)
    @text_state.update!
    assert_equal(expected_x, (@text_state.x*10000).to_i)
    assert_equal(expected_y, (@text_state.y*10000).to_i)
  end
end
|
90
|
+
# Same moves as test_update_td, but the leading is set before each move
# and checked afterwards together with the position.
def test_update_tD
  @text_state.set_x("100.1234")
  @text_state.set_y("200.5678")
  rounds = [
    # leads to set (in order), dx, dy, expected lead*1000, x, y
    [["-6.7890"],       "1.2345",  "6.7890",  -6789.0, 1013579, 2073568],
    [[-1.2345, 6.7890], "-1.2345", "-6.7890",  6789.0, 1001234, 2005678],
  ]
  rounds.each do |leads, dx, dy, expected_lead, expected_x, expected_y|
    leads.each { |lead| @text_state.set_lead(lead) }
    @text_state.update_x(dx)
    @text_state.update_y(dy)
    @text_state.update!
    assert_equal(expected_lead, @text_state.lead*1000)
    assert_equal(expected_x, (@text_state.x*10000).to_i)
    assert_equal(expected_y, (@text_state.y*10000).to_i)
  end
end
|
109
|
+
# set_lead accepts string values, positive as well as negative.
def test_set_lead_tl
  [["1.2345", 1234.5], ["-6.7890", -6789.0]].each do |given, expected|
    @text_state.set_lead(given)
    assert_equal(expected, @text_state.lead*1000)
  end
end
|
115
|
+
# After text has been set and x advanced, step is expected to put x back
# at its starting value and move y by the leading
# (200.1234 - 3.4567 = 196.6667).
def test_step
  @text_state.set_lead(-3.4567)
  @text_state.set_y(200.1234)
  @text_state.set_x(400.5678)
  @text_state.set_txt('foo')
  @text_state.advance_x
  @text_state.update!
  # advancing changed x only
  assert_equal(2001234, (@text_state.y * 10000).to_i)
  assert_not_equal(4005678, (@text_state.x * 10000).to_i)

  @text_state.step
  @text_state.update!
  assert_equal(1966667, (@text_state.y * 10000).to_i)
  assert_equal(4005678, (@text_state.x * 10000).to_i)
end
|
129
|
+
# Two states with font size 0.0 and different y values: the one at
# y = 250.3456 must compare greater than the one at y = 200.5678.
def test_compare1
  @text_state.set_font_size(0.0)
  @text_state.set_x(100.1234)
  @text_state.set_y(200.5678)
  @text_state.update!

  other = Rpdf2txt::TextState.new
  other.set_font_size(0.0)
  other.set_x(88.9012)
  other.set_y(250.3456)
  other.update!

  assert(other > @text_state, other <=> @text_state)
end
|
141
|
+
# Two states with the same y and font size: the one with the smaller x
# (88.9012 vs 100.1234) must compare less than the other.
def test_compare2
  @text_state.set_font_size(10)
  @text_state.set_x(100.1234)
  @text_state.set_y(200.5678)
  @text_state.update!

  other = Rpdf2txt::TextState.new
  other.set_font_size(10)
  other.set_x(88.9012)
  other.set_y(200.5678)
  other.update!

  assert(other < @text_state, other <=> @text_state)
end
|
153
|
+
# Four states sharing y and font size must all count as the same word;
# once p1 is moved to a different y it only matches itself.
def test_same_word
  # NOTE(review): every set_x below is sent to an instance that the next
  # line replaces, so the states under test only ever receive set_y.
  # Confirm whether the x values were meant to take part in this test.
  (p1 = Rpdf2txt::TextState.new).set_x(-10000)
  (p1 = Rpdf2txt::TextState.new).set_y(10000)
  (p2 = Rpdf2txt::TextState.new).set_x(-5000)
  (p2 = Rpdf2txt::TextState.new).set_y(10000)
  (p3 = Rpdf2txt::TextState.new).set_x(5000)
  (p3 = Rpdf2txt::TextState.new).set_y(10000)
  (p4 = Rpdf2txt::TextState.new).set_x(10000)
  (p4 = Rpdf2txt::TextState.new).set_y(10000)

  states = [p1, p2, p3, p4]
  states.each { |state| state.set_font_size(10) }
  states.each { |state| state.update! }

  # every state against itself and each later one
  states.each_with_index do |left, index|
    states[index..-1].each do |right|
      assert_equal(true, left.same_word(right))
    end
  end

  p1.set_y(-10)
  p1.update!
  assert_equal(true, p1.same_word(p1))
  [p2, p3, p4].each do |other|
    assert_equal(false, p1.same_word(other))
  end
end
|
187
|
+
# Each state spans the vertical range from y down to y2 (the assertions
# below show y2 = y - font size). States whose ranges merely touch are not
# on the same line; overlapping or nested ranges are.
def test_same_line
  # builds a state and checks its upper (y) and lower (y2) bounds
  build = lambda do |y, font_size, expected_y, expected_y2|
    state = Rpdf2txt::TextState.new
    state.set_y(y)
    state.set_font_size(font_size)
    state.update!
    assert_equal(expected_y, state.y)
    assert_equal(expected_y2, state.y2)
    state
  end

  ts1 = build.call(210, 10, 210.000, 200.000)
  ts2 = build.call(200, 10, 200.000, 190.000)

  # ts1 spans 210..200, ts2 spans 200..190: they only touch
  # => not same line
  assert_equal(false, ts1.same_line(ts2))
  assert_equal(false, ts2.same_line(ts1))

  ts3 = build.call(205, 10, 205.000, 195.000)

  # ts1 spans 210..200, ts3 spans 205..195: overlap => same line
  assert_equal(true, ts1.same_line(ts3))
  assert_equal(true, ts3.same_line(ts1))

  # ts3 spans 205..195, ts2 spans 200..190: overlap => same line
  assert_equal(true, ts2.same_line(ts3))
  assert_equal(true, ts3.same_line(ts2))

  ts4 = build.call(210, 30, 210.000, 180.000)

  # ts4 spans 210..180 and contains ts1 (210..200) => same line
  assert_equal(true, ts1.same_line(ts4))
  assert_equal(true, ts4.same_line(ts1))
end
|
246
|
+
# Setting a text must update the width w of the state (5.612 for
# "Hello World" with the fixture font).
def test_set_txt
  @text_state.set_txt("Hello World")
  width = @text_state.w
  assert_in_delta(5.612, width, 0.001)
end
|
250
|
+
# char_spacing starts at 0; the values given to set_char_spacing are
# expected back multiplied by 1000 (-0.456 => -456, 0.789 => 789).
def test_set_char_spacing
  assert_equal(0, @text_state.char_spacing)
  [['-0.456 Tc', -456], ['0.789 Tc', 789]].each do |operation, expected|
    @text_state.set_char_spacing(operation)
    assert_equal(expected, @text_state.char_spacing)
  end
end
|
257
|
+
# Variant of test_same_word in which every state also gets the fixture
# font and a three-letter text ("abc", "bcd", "cde", "def").
def test_same_word2
  # NOTE(review): every set_x below is sent to an instance that the next
  # line replaces, so the states under test only ever receive set_y.
  # Confirm whether the x values were meant to take part in this test.
  (p1 = Rpdf2txt::TextState.new).set_x(-10000)
  (p1 = Rpdf2txt::TextState.new).set_y(10000)
  (p2 = Rpdf2txt::TextState.new).set_x(-5000)
  (p2 = Rpdf2txt::TextState.new).set_y(10000)
  (p3 = Rpdf2txt::TextState.new).set_x(5000)
  (p3 = Rpdf2txt::TextState.new).set_y(10000)
  (p4 = Rpdf2txt::TextState.new).set_x(10000)
  (p4 = Rpdf2txt::TextState.new).set_y(10000)

  states = [p1, p2, p3, p4]
  states.each { |state| state.set_font_size(10.0) }

  letters = %w(a b c)
  states.each do |state|
    state.set_font(@font)
    state.set_txt(letters.join)
    state.update!
    # shift every letter by one for the next state
    letters = letters.map { |letter| letter.next }
  end

  # every state against itself and each later one
  states.each_with_index do |left, index|
    states[index..-1].each do |right|
      assert_equal(true, left.same_word(right))
    end
  end

  p1.set_y(-10)
  p1.update!
  assert_equal(true, p1.same_word(p1))
  [p2, p3, p4].each do |other|
    assert_equal(false, p1.same_word(other))
  end
end
|
294
|
+
# Checks the width reported for 'a' and for a blank: first the plain font
# widths, then with character spacing added to both, then with word
# spacing added to the blank only.
#
# Widths are computed floats, so they are compared with a small tolerance
# (as test_set_txt already does) instead of exact equality.
def test_char_width
  tolerance = 0.0001
  assert_in_delta(0.556, @text_state.char_width('a'), tolerance)
  assert_in_delta(0.278, @text_state.char_width(' '), tolerance)
  @text_state.set_char_spacing('0.023')
  assert_in_delta(0.579, @text_state.char_width('a'), tolerance)
  assert_in_delta(0.301, @text_state.char_width(' '), tolerance)
  @text_state.set_word_spacing('0.012')
  # word spacing must leave the width of 'a' untouched
  assert_in_delta(0.579, @text_state.char_width('a'), tolerance)
  assert_in_delta(0.313, @text_state.char_width(' '), tolerance)
end
|
304
|
+
# Sets a text through a mocked font that reports the 'mac' encoding and
# checks the text returned by the 'latin1' state.
# NOTE(review): the non-ASCII character in both strings shows up as the
# replacement character in this copy; the original bytes presumably
# differed between input and expectation -- verify against the real file.
def test_txt
  mock_font = flexmock('font')
  input = "Anwendung: Bei nerv�sen Herzbeschwerden"
  mock_font.should_receive(:encoding).and_return('mac')
  mock_font.should_receive(:attributes).and_return({})
  mock_font.should_ignore_missing
  @text_state.set_font(mock_font)
  @text_state.set_txt(input)
  assert_equal("Anwendung: Bei nerv�sen Herzbeschwerden", @text_state.txt)
end
|
315
|
+
end
|