rwv2 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +515 -0
- data/History.txt +5 -0
- data/INSTALL +112 -0
- data/InstalledFiles +3 -0
- data/Manifest.txt +42 -0
- data/README +92 -0
- data/README.txt +73 -0
- data/Rakefile +28 -0
- data/config.save +12 -0
- data/ext/rwv2/MANIFEST +9 -0
- data/ext/rwv2/Makefile +150 -0
- data/ext/rwv2/extconf.rb +33 -0
- data/ext/rwv2/include/rwv2.h +42 -0
- data/ext/rwv2/include/rwv2_associated_strings.h +40 -0
- data/ext/rwv2/include/rwv2_handlers.h +129 -0
- data/ext/rwv2/include/rwv2_parser.h +58 -0
- data/ext/rwv2/include/rwv2_properties.h +149 -0
- data/ext/rwv2/mkmf.log +12 -0
- data/ext/rwv2/rwv2.cpp +294 -0
- data/ext/rwv2/rwv2.o +0 -0
- data/ext/rwv2/rwv2.so +0 -0
- data/ext/rwv2/rwv2_handlers.cpp +81 -0
- data/ext/rwv2/rwv2_handlers.o +0 -0
- data/ext/rwv2/rwv2_parser.cpp +76 -0
- data/ext/rwv2/rwv2_parser.o +0 -0
- data/ext/rwv2/rwv2_properties.cpp +218 -0
- data/ext/rwv2/rwv2_properties.o +0 -0
- data/install.rb +1098 -0
- data/lib/rwv2/handlers.rb +52 -0
- data/lib/rwv2/rwv2.rb +28 -0
- data/rwv2-0.2.3.patch +223 -0
- data/test/data/not_a_word_document.rtf +16 -0
- data/test/data/test.doc +0 -0
- data/test/data/test2.doc +0 -0
- data/test/data/test3.doc +0 -0
- data/test/data/test4.doc +0 -0
- data/test/data/test5.doc +0 -0
- data/test/data/test6.doc +0 -0
- data/test/data/test7.doc +0 -0
- data/test/data/test8.doc +0 -0
- data/test/data/test9.doc +0 -0
- data/test/test_parser.rb +644 -0
- metadata +130 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rwv2 -- Microsoft Word Parser extension
|
4
|
+
# Copyright (C) 2003 Hannes Wyss, ywesee - intellectual capital connected
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected
|
21
|
+
# Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
22
|
+
# hwyss@ywesee.com
|
23
|
+
#
|
24
|
+
# Rwv2 -- Rwv2 -- 21.8.2003 -- hwyss@ywesee.com
|
25
|
+
|
26
|
+
module Rwv2
|
27
|
+
class SubDocumentHandler
|
28
|
+
def body_start; end
|
29
|
+
def body_end; end
|
30
|
+
def footnote_start; end
|
31
|
+
def footnote_end; end
|
32
|
+
def headers_start; end
|
33
|
+
def headers_end; end
|
34
|
+
def header_start(header_data); end
|
35
|
+
def header_end; end
|
36
|
+
end
|
37
|
+
class TableHandler
|
38
|
+
def row_start(table_properties); end
|
39
|
+
def row_end; end
|
40
|
+
def cell_start; end
|
41
|
+
def cell_end; end
|
42
|
+
end
|
43
|
+
class TextHandler
|
44
|
+
def section_start(section_properties); end
|
45
|
+
def section_end; end
|
46
|
+
def page_break; end
|
47
|
+
def paragraph_start(paragraph_properties); end
|
48
|
+
def paragraph_end; end
|
49
|
+
def run_of_text(text, character_properties); end
|
50
|
+
def picture(pict); end
|
51
|
+
end
|
52
|
+
end
|
data/lib/rwv2/rwv2.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rwv2 -- Microsoft Word Parser extension
|
4
|
+
# Copyright (C) 2003 Hannes Wyss, ywesee - intellectual capital connected
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected
|
21
|
+
# Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
22
|
+
# hwyss@ywesee.com
|
23
|
+
## rwv2 -- rwv2 -- 24.09.2003 -- rwaltert@ywesee.com
|
24
|
+
|
25
|
+
require 'rwv2'
|
26
|
+
require 'rwv2/handlers'
|
27
|
+
|
28
|
+
VERSION = '0.6.0'
|
data/rwv2-0.2.3.patch
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
diff -Nur wv2-0.2.3/src/handlers.cpp wv2-0.2.3-p1/src/handlers.cpp
|
2
|
+
--- wv2-0.2.3/src/handlers.cpp 2006-06-12 18:40:11.000000000 +0200
|
3
|
+
+++ wv2-0.2.3-p1/src/handlers.cpp 2008-02-18 18:20:26.000000000 +0100
|
4
|
+
@@ -177,6 +177,11 @@
|
5
|
+
{
|
6
|
+
}
|
7
|
+
|
8
|
+
+void TextHandler::pictureData( SharedPtr<const Word97::PICF> /*picf*/,
|
9
|
+
+ U8* /*buffer*/ )
|
10
|
+
+{
|
11
|
+
+}
|
12
|
+
+
|
13
|
+
void TextHandler::tableRowFound( const TableRowFunctor& tableRow, SharedPtr<const Word97::TAP> /*tap*/ )
|
14
|
+
{
|
15
|
+
tableRow();
|
16
|
+
diff -Nur wv2-0.2.3/src/handlers.h wv2-0.2.3-p1/src/handlers.h
|
17
|
+
--- wv2-0.2.3/src/handlers.h 2006-06-12 18:40:11.000000000 +0200
|
18
|
+
+++ wv2-0.2.3-p1/src/handlers.h 2008-02-18 18:20:30.000000000 +0100
|
19
|
+
@@ -251,7 +251,9 @@
|
20
|
+
* Very special characters (bad, bad name) are the ones which need additional
|
21
|
+
* information from the file (i.e. the plain "put the current date there" isn't sufficent).
|
22
|
+
*/
|
23
|
+
- enum VerySpecialCharacter { Picture = 1, FootnoteAuto = 2, FieldBegin = 19, FieldSeparator = 20,
|
24
|
+
+ enum VerySpecialCharacter { Picture = 1, FootnoteAuto = 2,
|
25
|
+
+ Drawing = 8,
|
26
|
+
+ FieldBegin = 19, FieldSeparator = 20,
|
27
|
+
FieldEnd = 21, FieldEscapeChar = 92 };
|
28
|
+
|
29
|
+
/**
|
30
|
+
@@ -299,6 +301,13 @@
|
31
|
+
virtual void fieldEnd( const FLD* fld, SharedPtr<const Word97::CHP> chp );
|
32
|
+
|
33
|
+
/**
|
34
|
+
+ * This method is called every time we find a picture.
|
35
|
+
+ * @param picf the picture-data.
|
36
|
+
+ */
|
37
|
+
+ virtual void pictureData( SharedPtr<const Word97::PICF> picf,
|
38
|
+
+ U8* buffer);
|
39
|
+
+
|
40
|
+
+ /**
|
41
|
+
* This method is called every time we find a table row. The default
|
42
|
+
* implementation invokes the functor, which triggers the parsing
|
43
|
+
* process for the given table row.
|
44
|
+
diff -Nur wv2-0.2.3/src/parser9x.cpp wv2-0.2.3-p1/src/parser9x.cpp
|
45
|
+
--- wv2-0.2.3/src/parser9x.cpp 2006-06-12 18:40:11.000000000 +0200
|
46
|
+
+++ wv2-0.2.3-p1/src/parser9x.cpp 2008-02-20 17:32:03.000000000 +0100
|
47
|
+
@@ -711,6 +711,7 @@
|
48
|
+
break;
|
49
|
+
|
50
|
+
// It has to be one of the very special characters...
|
51
|
+
+ case TextHandler::Drawing:
|
52
|
+
case TextHandler::Picture:
|
53
|
+
emitPictureData( chp );
|
54
|
+
break;
|
55
|
+
@@ -812,7 +813,6 @@
|
56
|
+
picf = new Word97::PICF( Word95::toWord97( Word95::PICF( stream, false ) ) );
|
57
|
+
else
|
58
|
+
picf = new Word97::PICF( stream, false );
|
59
|
+
- stream->pop();
|
60
|
+
|
61
|
+
if ( picf->cbHeader < 58 ) {
|
62
|
+
wvlog << "Error: Found an image with a PICF smaller than 58 bytes! Skipping the image." << std::endl;
|
63
|
+
@@ -838,8 +838,17 @@
|
64
|
+
<< std::endl << " dxaOrigin=" << picf->dxaOrigin << " dyaOrigin="
|
65
|
+
<< picf->dyaOrigin << std::endl;
|
66
|
+
#endif
|
67
|
+
- // for now
|
68
|
+
- delete picf;
|
69
|
+
+
|
70
|
+
+ /* extract the image blob */
|
71
|
+
+ stream->seek( chp->fcPic_fcObj_lTagObj + picf->cbHeader, G_SEEK_SET );
|
72
|
+
+ U32 len = picf->lcb - picf->cbHeader;
|
73
|
+
+ U8* buffer;
|
74
|
+
+ buffer = (U8 *)malloc(len * sizeof(U8));
|
75
|
+
+ stream->read(buffer, len);
|
76
|
+
+ m_textHandler->pictureData( picf, buffer );
|
77
|
+
+ free(buffer);
|
78
|
+
+
|
79
|
+
+ stream->pop();
|
80
|
+
}
|
81
|
+
|
82
|
+
void Parser9x::parseHeader( const HeaderData& data, unsigned char mask )
|
83
|
+
diff -Nur wv2-0.2.3/src/styles.cpp wv2-0.2.3-p1/src/styles.cpp
|
84
|
+
--- wv2-0.2.3/src/styles.cpp 2006-06-12 18:40:11.000000000 +0200
|
85
|
+
+++ wv2-0.2.3-p1/src/styles.cpp 2008-02-21 11:24:07.000000000 +0100
|
86
|
+
@@ -445,6 +445,9 @@
|
87
|
+
parentStyle = stylesheet.styleByIndex( m_std->istdBase );
|
88
|
+
if ( parentStyle ) {
|
89
|
+
const_cast<Style*>( parentStyle )->unwrapStyle( stylesheet, version );
|
90
|
+
+ // I'm getting Segfaults where there is no m_upechpx in parentStyle
|
91
|
+
+ if ( !parentStyle->m_upechpx )
|
92
|
+
+ parentStyle->m_upechpx = new UPECHPX();
|
93
|
+
bool ok;
|
94
|
+
m_upechpx->istd = stylesheet.indexByID( m_std->sti, ok );
|
95
|
+
mergeUpechpx( parentStyle, version );
|
96
|
+
@@ -665,7 +668,7 @@
|
97
|
+
else if ( cbStshi == Word97::STSHI::sizeOf )
|
98
|
+
m_stsh.read( tableStream, false );
|
99
|
+
else {
|
100
|
+
- wvlog << "Detected a different STSHI, check this (trying to read Word97 one)" << std::endl;
|
101
|
+
+ wvlog << "Detected a different STSHI, check this (trying to read Word97 one - probably Latent Style Data added in Word2003)" << std::endl;
|
102
|
+
m_stsh.read( tableStream, false );
|
103
|
+
}
|
104
|
+
|
105
|
+
diff -Nur wv2-0.2.3/src/word97_generated.h wv2-0.2.3-p1/src/word97_generated.h
|
106
|
+
--- wv2-0.2.3/src/word97_generated.h 2006-06-12 18:40:12.000000000 +0200
|
107
|
+
+++ wv2-0.2.3-p1/src/word97_generated.h 2008-02-21 11:03:07.000000000 +0100
|
108
|
+
@@ -8623,6 +8623,18 @@
|
109
|
+
*/
|
110
|
+
U16 rgftcStandardChpStsh[3];
|
111
|
+
|
112
|
+
+
|
113
|
+
+ /** introduced in Word2003 **/
|
114
|
+
+ /**
|
115
|
+
+ * size of each lsd in mpstilsd. The count of lsd's is stiMaxWhenSaved
|
116
|
+
+ */
|
117
|
+
+ //U16 cbLSD;
|
118
|
+
+
|
119
|
+
+ /**
|
120
|
+
+ * latent style data (stiMax == stiMaxWhenSaved upon save!)
|
121
|
+
+ */
|
122
|
+
+ //LSD mpstilsd[3];
|
123
|
+
+
|
124
|
+
}; // STSHI
|
125
|
+
|
126
|
+
bool operator==(const STSHI &lhs, const STSHI &rhs);
|
127
|
+
diff -Nur wv2-0.2.3/src/word97_helper.cpp wv2-0.2.3-p1/src/word97_helper.cpp
|
128
|
+
--- wv2-0.2.3/src/word97_helper.cpp 2006-06-12 18:50:45.000000000 +0200
|
129
|
+
+++ wv2-0.2.3-p1/src/word97_helper.cpp 2008-02-21 11:33:21.000000000 +0100
|
130
|
+
@@ -1137,8 +1137,14 @@
|
131
|
+
fBold = *ptr == 1;
|
132
|
+
else if ( *ptr == 128 && paragraphStyle )
|
133
|
+
fBold = paragraphStyle->chp().fBold;
|
134
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
135
|
+
- fBold = !( paragraphStyle->chp().fBold );
|
136
|
+
+ else if ( *ptr == 129 )
|
137
|
+
+ /**
|
138
|
+
+ * there are some Word-Documents where the Nil Style seems to be
|
139
|
+
+ * defined as Reversed. Obviously the Nil Style has no
|
140
|
+
+ * ParentStyle, which is why I've moved the paragraphStyle-check
|
141
|
+
+ * from the else if clause down to the next line.
|
142
|
+
+ */
|
143
|
+
+ fBold = !( paragraphStyle && paragraphStyle->chp().fBold );
|
144
|
+
else
|
145
|
+
wvlog << "Warning: sprmCFBold couldn't find a style" << std::endl;
|
146
|
+
break;
|
147
|
+
@@ -1147,8 +1153,8 @@
|
148
|
+
fItalic = *ptr == 1;
|
149
|
+
else if ( *ptr == 128 && paragraphStyle )
|
150
|
+
fItalic = paragraphStyle->chp().fItalic;
|
151
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
152
|
+
- fItalic = !( paragraphStyle->chp().fItalic );
|
153
|
+
+ else if ( *ptr == 129 )
|
154
|
+
+ fItalic = !( paragraphStyle && paragraphStyle->chp().fItalic );
|
155
|
+
else
|
156
|
+
wvlog << "Warning: sprmCFItalic couldn't find a style" << std::endl;
|
157
|
+
break;
|
158
|
+
@@ -1157,8 +1163,8 @@
|
159
|
+
fStrike = *ptr == 1;
|
160
|
+
else if ( *ptr == 128 && paragraphStyle )
|
161
|
+
fStrike = paragraphStyle->chp().fStrike;
|
162
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
163
|
+
- fStrike = !( paragraphStyle->chp().fStrike );
|
164
|
+
+ else if ( *ptr == 129 )
|
165
|
+
+ fStrike = !( paragraphStyle && paragraphStyle->chp().fStrike );
|
166
|
+
else
|
167
|
+
wvlog << "Warning: sprmCFStrike couldn't find a style" << std::endl;
|
168
|
+
break;
|
169
|
+
@@ -1167,8 +1173,8 @@
|
170
|
+
fOutline = *ptr == 1;
|
171
|
+
else if ( *ptr == 128 && paragraphStyle )
|
172
|
+
fOutline = paragraphStyle->chp().fOutline;
|
173
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
174
|
+
- fOutline = !( paragraphStyle->chp().fOutline );
|
175
|
+
+ else if ( *ptr == 129 )
|
176
|
+
+ fOutline = !( paragraphStyle && paragraphStyle->chp().fOutline );
|
177
|
+
else
|
178
|
+
wvlog << "Warning: sprmCFOutline couldn't find a style" << std::endl;
|
179
|
+
break;
|
180
|
+
@@ -1177,8 +1183,8 @@
|
181
|
+
fShadow = *ptr == 1;
|
182
|
+
else if ( *ptr == 128 && paragraphStyle )
|
183
|
+
fShadow = paragraphStyle->chp().fShadow;
|
184
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
185
|
+
- fShadow = !( paragraphStyle->chp().fShadow );
|
186
|
+
+ else if ( *ptr == 129 )
|
187
|
+
+ fShadow = !( paragraphStyle && paragraphStyle->chp().fShadow );
|
188
|
+
else
|
189
|
+
wvlog << "Warning: sprmCFShadow couldn't find a style" << std::endl;
|
190
|
+
break;
|
191
|
+
@@ -1187,8 +1193,8 @@
|
192
|
+
fSmallCaps = *ptr == 1;
|
193
|
+
else if ( *ptr == 128 && paragraphStyle )
|
194
|
+
fSmallCaps = paragraphStyle->chp().fSmallCaps;
|
195
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
196
|
+
- fSmallCaps = !( paragraphStyle->chp().fSmallCaps );
|
197
|
+
+ else if ( *ptr == 129 )
|
198
|
+
+ fSmallCaps = !( paragraphStyle && paragraphStyle->chp().fSmallCaps );
|
199
|
+
else
|
200
|
+
wvlog << "Warning: sprmCFSmallCaps couldn't find a style" << std::endl;
|
201
|
+
break;
|
202
|
+
@@ -1197,8 +1203,8 @@
|
203
|
+
fCaps = *ptr == 1;
|
204
|
+
else if ( *ptr == 128 && paragraphStyle )
|
205
|
+
fCaps = paragraphStyle->chp().fCaps;
|
206
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
207
|
+
- fCaps = !( paragraphStyle->chp().fCaps );
|
208
|
+
+ else if ( *ptr == 129 )
|
209
|
+
+ fCaps = !( paragraphStyle && paragraphStyle->chp().fCaps );
|
210
|
+
else
|
211
|
+
wvlog << "Warning: sprmCFCaps couldn't find a style" << std::endl;
|
212
|
+
break;
|
213
|
+
@@ -1207,8 +1213,8 @@
|
214
|
+
fVanish = *ptr == 1;
|
215
|
+
else if ( *ptr == 128 && paragraphStyle )
|
216
|
+
fVanish = paragraphStyle->chp().fVanish;
|
217
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
218
|
+
- fVanish = !( paragraphStyle->chp().fVanish );
|
219
|
+
+ else if ( *ptr == 129 )
|
220
|
+
+ fVanish = !( paragraphStyle && paragraphStyle->chp().fVanish );
|
221
|
+
else
|
222
|
+
wvlog << "Warning: sprmCFVanish couldn't find a style" << std::endl;
|
223
|
+
break;
|
@@ -0,0 +1,16 @@
|
|
1
|
+
{\rtf1\ansi\deff0\adeflang1025
|
2
|
+
{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fnil\fprq2\fcharset0 Arial;}{\f3\fnil\fprq0\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 HG Mincho Light J{\*\falt msmincho};}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}}
|
3
|
+
{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
|
4
|
+
{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\snext1 Normal;}
|
5
|
+
{\s2\sb240\sa120\keepn\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs28\lang255\ltrch\dbch\af4\langfe255\hich\f3\fs28\lang2057\loch\f3\fs28\lang2057\sbasedon1\snext3 Heading;}
|
6
|
+
{\s3\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\sbasedon1\snext3 Body Text;}
|
7
|
+
{\s4\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\sbasedon3\snext4 List;}
|
8
|
+
{\s5\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\i\loch\f0\fs24\lang2057\i\sbasedon1\snext5 caption;}
|
9
|
+
{\s6\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\sbasedon1\snext6 Index;}
|
10
|
+
}
|
11
|
+
{\info{\author Hannes Wyss}{\creatim\yr2008\mo2\dy19\hr11\min34}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709
|
12
|
+
{\*\pgdsctbl
|
13
|
+
{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
|
14
|
+
\paperh15840\paperw12240\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
15
|
+
\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057 {\rtlch \ltrch\loch\f0\fs24\lang2057\i0\b0 An RTF-File}
|
16
|
+
\par }
|
data/test/data/test.doc
ADDED
Binary file
|
data/test/data/test2.doc
ADDED
Binary file
|
data/test/data/test3.doc
ADDED
Binary file
|
data/test/data/test4.doc
ADDED
Binary file
|
data/test/data/test5.doc
ADDED
Binary file
|
data/test/data/test6.doc
ADDED
Binary file
|
data/test/data/test7.doc
ADDED
Binary file
|
data/test/data/test8.doc
ADDED
Binary file
|
data/test/data/test9.doc
ADDED
Binary file
|
data/test/test_parser.rb
ADDED
@@ -0,0 +1,644 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rwv2 -- Microsoft Word Parser extension
|
4
|
+
# Copyright (C) 2003 Hannes Wyss, ywesee - intellectual capital connected
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected
|
21
|
+
# Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
22
|
+
# hwyss@ywesee.com
|
23
|
+
#
|
24
|
+
# TestParser -- Rwv2 -- 21.8.2003 -- hwyss@ywesee.com
|
25
|
+
|
26
|
+
$: << File.expand_path('../ext/rwv2', File.dirname(__FILE__))
|
27
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
28
|
+
|
29
|
+
require 'test/unit'
|
30
|
+
require 'rwv2/rwv2'
|
31
|
+
require 'RMagick'
|
32
|
+
require 'iconv'
|
33
|
+
|
34
|
+
class StubInlineReplacementHandler
|
35
|
+
attr_accessor :non_required_hyphen
|
36
|
+
def column_break
|
37
|
+
"c"
|
38
|
+
end
|
39
|
+
def hard_line_break
|
40
|
+
"\n"
|
41
|
+
end
|
42
|
+
def non_breaking_hyphen
|
43
|
+
"="
|
44
|
+
end
|
45
|
+
def non_breaking_space
|
46
|
+
"_"
|
47
|
+
end
|
48
|
+
def tab
|
49
|
+
"t"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
class StubIncompleteReplacementHandler
|
53
|
+
def hard_line_break
|
54
|
+
"\n"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
["body", "footnote", "headers", "header"].each { |tpe|
|
58
|
+
eval <<-EOF
|
59
|
+
class StubSubDocumentHandler
|
60
|
+
attr_reader :#{tpe}_starts, :#{tpe}_ends
|
61
|
+
def #{tpe}_start
|
62
|
+
@#{tpe}_starts ||= 0
|
63
|
+
@#{tpe}_starts += 1
|
64
|
+
end
|
65
|
+
def #{tpe}_end
|
66
|
+
@#{tpe}_ends ||= 0
|
67
|
+
@#{tpe}_ends += 1
|
68
|
+
end
|
69
|
+
end
|
70
|
+
EOF
|
71
|
+
}
|
72
|
+
class StubSubDocumentHandler
|
73
|
+
attr_reader :header_starts
|
74
|
+
def initialize
|
75
|
+
@header_starts = []
|
76
|
+
end
|
77
|
+
def header_start(header_type)
|
78
|
+
@header_starts << header_type
|
79
|
+
end
|
80
|
+
end
|
81
|
+
class StubTextHandler
|
82
|
+
attr_accessor :texts, :formats, :section_properties, :section_ends
|
83
|
+
attr_accessor :page_breaks, :paragraph_properties, :paragraph_ends, :pictures
|
84
|
+
def initialize
|
85
|
+
@iconv = Iconv.new('utf8', 'utf-16')
|
86
|
+
@pictures = []
|
87
|
+
@formats = []
|
88
|
+
@texts = []
|
89
|
+
@section_properties = []
|
90
|
+
@paragraph_properties = []
|
91
|
+
@section_ends = 0
|
92
|
+
@paragraph_ends = 0
|
93
|
+
@page_breaks = 0
|
94
|
+
end
|
95
|
+
def picture(picture)
|
96
|
+
@pictures.push picture
|
97
|
+
end
|
98
|
+
def section_start(sep)
|
99
|
+
@section_properties << sep
|
100
|
+
end
|
101
|
+
def section_end
|
102
|
+
@section_ends += 1
|
103
|
+
end
|
104
|
+
def page_break
|
105
|
+
@page_breaks += 1
|
106
|
+
end
|
107
|
+
def paragraph_start(pap)
|
108
|
+
@paragraph_properties << pap
|
109
|
+
end
|
110
|
+
def paragraph_end
|
111
|
+
@paragraph_ends += 1
|
112
|
+
end
|
113
|
+
def run_of_text(text, format=nil)
|
114
|
+
@formats << format unless format.nil?
|
115
|
+
@texts << @iconv.iconv(text)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
class StubTableHandler
|
119
|
+
attr_reader :row_starts, :row_ends, :cell_starts, :cell_ends
|
120
|
+
def initialize
|
121
|
+
@row_starts = []
|
122
|
+
@row_ends = 0
|
123
|
+
@cell_starts = 0
|
124
|
+
@cell_ends = 0
|
125
|
+
end
|
126
|
+
def row_start(properties=nil)
|
127
|
+
@row_starts << properties
|
128
|
+
end
|
129
|
+
def row_end
|
130
|
+
@row_ends += 1
|
131
|
+
end
|
132
|
+
def cell_start
|
133
|
+
@cell_starts += 1
|
134
|
+
end
|
135
|
+
def cell_end
|
136
|
+
@cell_ends += 1
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
class TestRwv2Parser < Test::Unit::TestCase
|
141
|
+
def setup
|
142
|
+
@filename = File.expand_path('data/test.doc', File.dirname(__FILE__))
|
143
|
+
@filename2 = File.expand_path('data/test2.doc', File.dirname(__FILE__))
|
144
|
+
@filename3 = File.expand_path('data/test3.doc', File.dirname(__FILE__))
|
145
|
+
@filename4 = File.expand_path('data/test4.doc', File.dirname(__FILE__))
|
146
|
+
@filename5 = File.expand_path('data/test5.doc', File.dirname(__FILE__))
|
147
|
+
@filename6 = File.expand_path('data/test6.doc', File.dirname(__FILE__))
|
148
|
+
@filename7 = File.expand_path('data/test7.doc', File.dirname(__FILE__))
|
149
|
+
@filename8 = File.expand_path('data/test8.doc', File.dirname(__FILE__))
|
150
|
+
@filename9 = File.expand_path('data/test9.doc', File.dirname(__FILE__))
|
151
|
+
@unavailable = File.expand_path('data/unavailable.doc', File.dirname(__FILE__))
|
152
|
+
@rtf = File.expand_path('data/not_a_word_document.rtf', File.dirname(__FILE__))
|
153
|
+
@ir_handler = StubInlineReplacementHandler.new
|
154
|
+
@ir_handler.non_required_hyphen = "-"
|
155
|
+
end
|
156
|
+
def test_create_parser
|
157
|
+
assert_nothing_raised {
|
158
|
+
Rwv2.create_parser(@filename)
|
159
|
+
}
|
160
|
+
assert_nothing_raised {
|
161
|
+
Rwv2.create_parser_from_content(File.read(@filename))
|
162
|
+
}
|
163
|
+
end
|
164
|
+
def test_inline_replacement_handler
|
165
|
+
parser = Rwv2.create_parser(@filename)
|
166
|
+
handler = StubTextHandler.new
|
167
|
+
parser.set_text_handler(handler)
|
168
|
+
assert_nothing_raised {
|
169
|
+
parser.set_inline_replacement_handler(@ir_handler)
|
170
|
+
}
|
171
|
+
parser.parse
|
172
|
+
expected = [
|
173
|
+
"Paragraph 1, Standard",
|
174
|
+
"Paragraph 2, Bold",
|
175
|
+
"Paragraph 3, Italic",
|
176
|
+
"Paragraph 4, Underlined",
|
177
|
+
"Paragraph 5, Bold Italic",
|
178
|
+
"Paragraph 6, Bold Underlined",
|
179
|
+
"Paragraph 7, Italic Underlined",
|
180
|
+
"Paragraph 8, Bold Italic Underlined",
|
181
|
+
"Paragraph 9, ",
|
182
|
+
"mixed Formats",
|
183
|
+
"TabtTab",
|
184
|
+
"HardLineBreak\nHardLineBreak",
|
185
|
+
"ColumnBreakcColumnBreak",
|
186
|
+
"NonBreakingHyphen=NonBreakingHyphen",
|
187
|
+
"NonRequiredHyphen-NonRequiredHyphen",
|
188
|
+
"NonBreakingSpace",
|
189
|
+
"_",
|
190
|
+
"NonBreakingSpace",
|
191
|
+
]
|
192
|
+
assert_equal(expected, handler.texts)
|
193
|
+
end
|
194
|
+
def test_incomplete_replacement_handler
|
195
|
+
parser = Rwv2.create_parser(@filename)
|
196
|
+
handler = StubTextHandler.new
|
197
|
+
parser.set_text_handler(handler)
|
198
|
+
replacer = StubIncompleteReplacementHandler.new
|
199
|
+
assert_nothing_raised {
|
200
|
+
parser.set_inline_replacement_handler(replacer)
|
201
|
+
}
|
202
|
+
parser.parse
|
203
|
+
expected = [
|
204
|
+
"Paragraph 1, Standard",
|
205
|
+
"Paragraph 2, Bold",
|
206
|
+
"Paragraph 3, Italic",
|
207
|
+
"Paragraph 4, Underlined",
|
208
|
+
"Paragraph 5, Bold Italic",
|
209
|
+
"Paragraph 6, Bold Underlined",
|
210
|
+
"Paragraph 7, Italic Underlined",
|
211
|
+
"Paragraph 8, Bold Italic Underlined",
|
212
|
+
"Paragraph 9, ",
|
213
|
+
"mixed Formats",
|
214
|
+
"Tab\tTab",
|
215
|
+
"HardLineBreak\nHardLineBreak",
|
216
|
+
"ColumnBreak\016ColumnBreak",
|
217
|
+
"NonBreakingHyphen\036NonBreakingHyphen",
|
218
|
+
"NonRequiredHyphen\037NonRequiredHyphen",
|
219
|
+
"NonBreakingSpace",
|
220
|
+
"\302\240",
|
221
|
+
"NonBreakingSpace"
|
222
|
+
]
|
223
|
+
assert_equal(expected, handler.texts)
|
224
|
+
end
|
225
|
+
def test_illegal_replacement_handler
|
226
|
+
parser = Rwv2.create_parser(@filename)
|
227
|
+
handler = StubTextHandler.new
|
228
|
+
parser.set_text_handler(handler)
|
229
|
+
parser.set_inline_replacement_handler(@ir_handler)
|
230
|
+
@ir_handler.non_required_hyphen = ""
|
231
|
+
assert_raises(RuntimeError) {
|
232
|
+
parser.parse
|
233
|
+
}
|
234
|
+
@ir_handler.non_required_hyphen = "--"
|
235
|
+
assert_raises(RuntimeError) {
|
236
|
+
parser.parse
|
237
|
+
}
|
238
|
+
end
|
239
|
+
def test_subdocument_handler
|
240
|
+
parser = Rwv2.create_parser(@filename4)
|
241
|
+
handler = StubSubDocumentHandler.new
|
242
|
+
assert_nothing_raised {
|
243
|
+
parser.set_subdocument_handler(handler)
|
244
|
+
}
|
245
|
+
parser.parse
|
246
|
+
assert_equal(1, handler.body_starts)
|
247
|
+
assert_equal(1, handler.body_ends)
|
248
|
+
assert_equal(1, handler.footnote_starts)
|
249
|
+
assert_equal(1, handler.footnote_ends)
|
250
|
+
assert_equal(1, handler.headers_starts)
|
251
|
+
assert_equal(1, handler.headers_ends)
|
252
|
+
assert_equal(2, handler.header_ends)
|
253
|
+
assert_equal([Rwv2::HEADER_ODD, Rwv2::FOOTER_ODD], handler.header_starts)
|
254
|
+
end
|
255
|
+
def test_table_handler
|
256
|
+
parser = Rwv2.create_parser(@filename5)
|
257
|
+
handler = StubTableHandler.new
|
258
|
+
assert_nothing_raised {
|
259
|
+
parser.set_table_handler(handler)
|
260
|
+
}
|
261
|
+
parser.parse
|
262
|
+
assert_equal(6, handler.row_ends)
|
263
|
+
assert_equal(11, handler.cell_starts)
|
264
|
+
assert_equal(11, handler.cell_ends)
|
265
|
+
head = handler.row_starts.at(0)
|
266
|
+
row0 = handler.row_starts.at(1)
|
267
|
+
row1 = handler.row_starts.at(2)
|
268
|
+
row2 = handler.row_starts.at(3)
|
269
|
+
row3 = handler.row_starts.at(4)
|
270
|
+
row4 = handler.row_starts.at(5)
|
271
|
+
assert_equal(283, row0.row_height)
|
272
|
+
assert_equal(-283, row1.row_height)
|
273
|
+
assert_equal(2, head.row_cells)
|
274
|
+
assert_equal(2, row0.row_cells)
|
275
|
+
assert_equal(2, row1.row_cells)
|
276
|
+
assert_equal(1, row2.row_cells)
|
277
|
+
assert_equal(2, row3.row_cells)
|
278
|
+
assert_equal(2, row4.row_cells)
|
279
|
+
assert_equal(3, row0.cell_boundaries.size)
|
280
|
+
assert_equal(0, row0.cell_boundaries.at(0))
|
281
|
+
assert_equal(4818, row0.cell_boundaries.at(1))
|
282
|
+
assert_equal(9639, row0.cell_boundaries.at(2))
|
283
|
+
assert_equal(2, row0.cell_descriptors.size)
|
284
|
+
ct0 = row0.cell_descriptors.first
|
285
|
+
cta = row0.cell_descriptors.last
|
286
|
+
ctb = row1.cell_descriptors.first
|
287
|
+
assert_instance_of(Rwv2::TableProperties::CellDescriptor, ct0)
|
288
|
+
assert_equal(false, ct0.first_merged?)
|
289
|
+
assert_equal(Rwv2::TableProperties::CellDescriptor::ALIGN_TOP, ct0.vertical_align)
|
290
|
+
assert_equal(Rwv2::TableProperties::CellDescriptor::ALIGN_CENTER, cta.vertical_align)
|
291
|
+
assert_equal(Rwv2::TableProperties::CellDescriptor::ALIGN_BOTTOM, ctb.vertical_align)
|
292
|
+
ct2 = row3.cell_descriptors.first
|
293
|
+
assert_equal(true, ct2.vertical_merged?)
|
294
|
+
assert_equal(true, ct2.vertical_restart?)
|
295
|
+
# ct2 = row3.cell_descriptors.first
|
296
|
+
# assert_equal(true, ct2.vertical)
|
297
|
+
|
298
|
+
# FIXME: the following are untested,
|
299
|
+
# need a _real_ Wordfile to test...
|
300
|
+
# assert_equal(Rwv2::TableProperties::ALIGN_LEFT, row0.align)
|
301
|
+
# assert_equal(Rwv2::TableProperties::ALIGN_LEFT, row1.align)
|
302
|
+
# row0.gap_half
|
303
|
+
# assert_equal(true, row0.cant_split)
|
304
|
+
# assert_equal(false, row1.cant_split)
|
305
|
+
# ct1 = row2.cell_descriptors.first
|
306
|
+
# assert_equal(true, ct1.merged)
|
307
|
+
# assert_equal(true, ct1.first_merged)
|
308
|
+
# :rotate_font, :backward, :vertical_merged,
|
309
|
+
# :vertical_restart, :vertical_align
|
310
|
+
end
|
311
|
+
def test_text_handler
|
312
|
+
parser = Rwv2.create_parser(@filename)
|
313
|
+
handler = StubTextHandler.new
|
314
|
+
assert_nothing_raised {
|
315
|
+
parser.set_text_handler(handler)
|
316
|
+
}
|
317
|
+
parser.parse
|
318
|
+
expected = [
|
319
|
+
"Paragraph 1, Standard",
|
320
|
+
"Paragraph 2, Bold",
|
321
|
+
"Paragraph 3, Italic",
|
322
|
+
"Paragraph 4, Underlined",
|
323
|
+
"Paragraph 5, Bold Italic",
|
324
|
+
"Paragraph 6, Bold Underlined",
|
325
|
+
"Paragraph 7, Italic Underlined",
|
326
|
+
"Paragraph 8, Bold Italic Underlined",
|
327
|
+
"Paragraph 9, ",
|
328
|
+
"mixed Formats",
|
329
|
+
"Tab\tTab",
|
330
|
+
"HardLineBreak\vHardLineBreak",
|
331
|
+
"ColumnBreak\016ColumnBreak",
|
332
|
+
"NonBreakingHyphen\036NonBreakingHyphen",
|
333
|
+
"NonRequiredHyphen\037NonRequiredHyphen",
|
334
|
+
"NonBreakingSpace",
|
335
|
+
"\302\240",
|
336
|
+
"NonBreakingSpace"
|
337
|
+
]
|
338
|
+
assert_equal(expected, handler.texts)
|
339
|
+
paps = handler.paragraph_properties
|
340
|
+
assert_equal(16, paps.size)
|
341
|
+
assert_equal(16, handler.paragraph_ends)
|
342
|
+
pap0 = paps.at(0)
|
343
|
+
pap1 = paps.at(1)
|
344
|
+
pap2 = paps.at(2)
|
345
|
+
pap3 = paps.at(3)
|
346
|
+
pap4 = paps.at(4)
|
347
|
+
pap5 = paps.at(5)
|
348
|
+
pap6 = paps.at(6)
|
349
|
+
assert_instance_of(Rwv2::ParagraphProperties, pap0)
|
350
|
+
assert_equal(Rwv2::ALIGN_LEFT, pap0.align)
|
351
|
+
assert_equal(Rwv2::ALIGN_CENTER, pap1.align)
|
352
|
+
assert_equal(Rwv2::ALIGN_RIGHT, pap2.align)
|
353
|
+
assert_equal(Rwv2::ALIGN_JUSTIFY, pap3.align)
|
354
|
+
assert_equal(true, pap0.keep?)
|
355
|
+
assert_equal(false, pap0.keep_with_next?)
|
356
|
+
assert_equal(false, pap0.widow_control?)
|
357
|
+
assert_equal(false, pap1.keep?)
|
358
|
+
assert_equal(true, pap1.keep_with_next?)
|
359
|
+
assert_equal(false, pap1.widow_control?)
|
360
|
+
assert_equal(false, pap2.keep?)
|
361
|
+
assert_equal(false, pap2.keep_with_next?)
|
362
|
+
assert_equal(true, pap2.widow_control?)
|
363
|
+
assert_equal(false, pap3.keep?)
|
364
|
+
assert_equal(false, pap3.keep_with_next?)
|
365
|
+
assert_equal(false, pap3.widow_control?)
|
366
|
+
# FIXME does this work with a real Wordfile?
|
367
|
+
# assert_equal(false, pap0.page_break_before?)
|
368
|
+
# assert_equal(false, pap1.page_break_before?)
|
369
|
+
# assert_equal(false, pap2.page_break_before?)
|
370
|
+
# assert_equal(true, pap3.page_break_before?)
|
371
|
+
assert_equal(1680, pap4.indent_right)
|
372
|
+
assert_equal(0, pap4.indent_left)
|
373
|
+
assert_equal(0, pap4.indent_first_line)
|
374
|
+
assert_equal(0, pap5.indent_right)
|
375
|
+
assert_equal(570, pap5.indent_left)
|
376
|
+
assert_equal(0, pap5.indent_first_line)
|
377
|
+
assert_equal(0, pap6.indent_right)
|
378
|
+
assert_equal(570, pap6.indent_left)
|
379
|
+
assert_equal(-285, pap6.indent_first_line)
|
380
|
+
end
|
381
|
+
# Parsing with a plain Rwv2::TextHandler instance (rather than the
# StubTextHandler used by the other tests) must not raise.
def test_incomplete_text_handler
  word_parser = Rwv2.create_parser(@filename)
  # Keep the handler in a local so a Ruby-side reference exists while parsing.
  plain_handler = Rwv2::TextHandler.new
  word_parser.set_text_handler(plain_handler)
  assert_nothing_raised do
    word_parser.parse
  end
end
|
387
|
+
# Parses the first fixture and checks which recorded character-property
# objects report bold and italic formatting, identified by their index in
# handler.formats.
def test_character_properties
  parser = Rwv2.create_parser(@filename)
  handler = StubTextHandler.new
  parser.set_text_handler(handler)
  parser.parse
  formats = handler.formats
  # Guard first: the index comparisons below are meaningless on an empty list.
  assert_equal(false, formats.empty?,
    "The Parser recorded no Character Properties")
  # Indices of the formats whose bold? predicate is truthy.
  expected = [1,4,5,7,9]
  assert_equal(expected, index_select(formats, :bold?))
  # Indices of the formats whose italic? predicate is truthy.
  expected = [2,4,6,7]
  assert_equal(expected, index_select(formats, :italic?))
end
|
401
|
+
# Parses the second fixture, verifies the text runs handed to the handler,
# and checks that each character-property predicate is truthy for exactly
# the expected run indices.
def test_character_properties2
  word_parser = Rwv2.create_parser(@filename2)
  stub = StubTextHandler.new
  word_parser.set_text_handler(stub)
  word_parser.parse

  expected_texts = [
    "The ", "new Text!",
    "This will be deleted.",
    "Outlined",
    "Small Caps",
    "Caps",
    "Strikethrough",
    "Shadow",
    "Lower Case",
    "Embossed",
    "Engraved",
    "Double Strikethrough",
  ]
  assert_equal(expected_texts, stub.texts)

  char_props = stub.formats
  assert_equal(false, char_props.empty?,
    "The Parser recorded no Character Properties")

  # [expected run indices, predicate] pairs, checked in this order.
  [
    [[0, 1], :rev_mark?],
    [[2], :rev_mark_del?],
    [[3], :outline?],
    [[4], :small_caps?],
    [[5], :caps?],
    [[6], :strikethrough?],
    [[7], :shadow?],
  ].each do |indices, predicate|
    assert_equal(indices, index_select(char_props, predicate))
  end

  #assert_equal([8], index_select(char_props, :lowercase?)) # FIXME
  # our test-file is made with Openoffice - which does not set the
  # lowercase-flag in Word Files...

  [
    [[9], :emboss?],
    [[10], :imprint?],
    [[11], :double_strikethrough?],
  ].each do |indices, predicate|
    assert_equal(indices, index_select(char_props, predicate))
  end
end
|
437
|
+
# Parses the third fixture and checks the position (normal / superscript /
# subscript), underline style, scale and font size reported for its runs.
def test_character_properties3
  word_parser = Rwv2.create_parser(@filename3)
  stub = StubTextHandler.new
  word_parser.set_text_handler(stub)
  word_parser.parse

  expected_texts = [
    "Normal",
    "Superscript",
    "Subscript",
    "Single",
    "By Word",
    "Double",
    "Dotted",
    "Thick",
    "Dash",
    "Dot Dash",
    "Dot Dot Dash",
    "Wave",
  ]
  assert_equal(expected_texts, stub.texts)

  fmts = stub.formats
  assert_equal(false, fmts.empty?,
    "The Parser recorded no Character Properties")

  # Shorthand for the constant namespace used by every assertion below.
  props = Rwv2::CharacterProperties

  assert_equal(props::POSITION_NORMAL, fmts[0].position)
  assert_equal(props::POSITION_SUPERSCRIPT, fmts[1].position)
  assert_equal(props::POSITION_SUBSCRIPT, fmts[2].position)

  assert_equal(false, fmts[0].underline)
  #assert_equal([3,4,5,6,7,8,9,10,11], index_select(fmts, :underline)) # FIXME
  # Openoffice saves simple underline somewhere else?
  assert_equal((4..11).to_a, index_select(fmts, :underline)) # FIXME

  assert_equal(props::UNDERLINE_NONE, fmts[0].underline)
  #assert_equal(props::UNDERLINE_SINGLE, fmts[3].underline) # FIXME
  assert_equal(props::UNDERLINE_BY_WORD, fmts[4].underline)
  assert_equal(props::UNDERLINE_DOUBLE, fmts[5].underline)
  assert_equal(props::UNDERLINE_DOTTED, fmts[6].underline)
  assert_equal(props::UNDERLINE_THICK, fmts[7].underline)
  assert_equal(props::UNDERLINE_DASH, fmts[8].underline)
  assert_equal(props::UNDERLINE_DOT_DASH, fmts[9].underline)
  assert_equal(props::UNDERLINE_DOT_DOT_DASH, fmts[10].underline)
  assert_equal(props::UNDERLINE_WAVE, fmts[11].underline)

  assert_equal(100, fmts[0].scale)
  assert_equal(24, fmts[0].fontsize)
end
|
480
|
+
# Parses the sixth fixture and checks the number of sections, section ends
# and page breaks seen by the handler, the flags of the first section, and
# the column count of every section.
def test_section_properties
  word_parser = Rwv2.create_parser(@filename6)
  stub = StubTextHandler.new
  word_parser.set_text_handler(stub)
  word_parser.parse

  sections = stub.section_properties
  assert_equal(5, sections.size)
  assert_equal(5, stub.section_ends)
  assert_equal(1, stub.page_breaks)

  sect0, sect1, sect2, sect3, sect4 = (0..4).map { |pos| sections.at(pos) }

  assert_instance_of(Rwv2::SectionProperties, sect0)
  assert_equal(Rwv2::SectionProperties::NUMBER_ARABIC, sect0.page_number_format)
  assert_equal(false, sect0.title_page?)
  assert_equal(false, sect0.unlocked?)
  assert_equal(false, sect0.page_number_restart?)
  assert_equal(false, sect0.line_numbering_modulus)

  # Expected column count per section, in document order.
  [sect0, sect1, sect2, sect3, sect4].zip([1, 2, 3, 2, 1]).each do |sect, cols|
    assert_equal(cols, sect.columns)
  end

  # FIXME: get a real wordfile...
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sect0.break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sect1.break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sect2.break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sect3.break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sect4.break_code)
  # assert_equal(Rwv2::SectionProperties::LINE_NUMBERING_PER_PAGE, sect0.line_numbering_code)
  # assert_equal(true, sect0.endnote?)
  # assert_equal(true, sect1.endnote?)
  # assert_equal(false, sect2.endnote?)
  # assert_equal(true, sect3.endnote?)
end
|
518
|
+
# Installs all three handler kinds as temporaries (no local variable keeps
# them referenced from this method), forces a garbage collection run, and
# then checks that parsing still does not raise.
# NOTE(review): presumably guards against the extension losing handlers it
# does not keep alive itself - confirm against the C++ sources.
def test_gc
  parser = Rwv2.create_parser(@filename)
  # Deliberately passed inline: do not hoist these into locals.
  parser.set_inline_replacement_handler(@ir_handler.dup)
  parser.set_subdocument_handler(StubSubDocumentHandler.new)
  parser.set_text_handler(StubTextHandler.new)
  GC.start
  sleep(0.5)
  assert_nothing_raised do
    parser.parse
  end
end
|
529
|
+
# Parses the seventh fixture and checks the tab descriptors attached to the
# first paragraph: their count, positions and alignments.
def test_tab_descriptors
  word_parser = Rwv2.create_parser(@filename7)
  stub = StubTextHandler.new
  assert_nothing_raised do
    word_parser.set_text_handler(stub)
  end
  word_parser.parse

  first_pap = stub.paragraph_properties.at(0)
  assert_instance_of(Rwv2::ParagraphProperties, first_pap)

  tabs = first_pap.tab_descriptors
  assert_equal(4, tabs.size)
  tab0, tab1, tab2, tab3 = tabs
  assert_instance_of(Rwv2::TabDescriptor, tab0)

  # Expected tab-stop position for each descriptor, in order.
  [tab0, tab1, tab2, tab3].zip([1410, 2835, 4230, 5655]).each do |tab, pos|
    assert_equal(pos, tab.position)
  end

  assert_equal(Rwv2::TabDescriptor::ALIGN_LEFT, tab0.align)
  assert_equal(Rwv2::TabDescriptor::ALIGN_RIGHT, tab1.align)
  assert_equal(Rwv2::TabDescriptor::ALIGN_CENTER, tab3.align)
  # FIXME
  # assert_equal(Rwv2::TabDescriptor::ALIGN_DECIMAL, tab2.align)
end
|
553
|
+
# Creating a parser for the path in @unavailable must raise Errno::ENOENT.
def test_unavailable
  assert_raises(Errno::ENOENT) do
    Rwv2.create_parser(@unavailable)
  end
end
|
558
|
+
# Creating a parser for the RTF fixture must raise ArgumentError with a
# message that names the offending path.
def test_invalid__rtf
  # assert_raises returns the rescued exception, so the message can be
  # checked on the same failure instead of creating the parser a second time
  # inside a begin/rescue whose assertion would be skipped if nothing raised.
  err = assert_raises(ArgumentError) {
    Rwv2.create_parser(@rtf)
  }
  # Plain interpolation: the path must not be used as a sprintf format
  # string, or a '%' in it would corrupt the expected message.
  assert_equal("'#{@rtf}' is not a word-document.", err.message)
end
|
568
|
+
# Creating a parser from the RTF fixture's content must raise ArgumentError
# with the generic "not a word-document" message.
def test_invalid__rtf__from_content
  # assert_raises returns the rescued exception, so the message is checked on
  # the same failure rather than in a second begin/rescue whose assertion
  # would be skipped if nothing raised.
  err = assert_raises(ArgumentError) {
    Rwv2.create_parser_from_content(File.read(@rtf))
  }
  assert_equal("Input is not a word-document.", err.message)
end
|
578
|
+
# Parses the eighth fixture and checks the two picture descriptors passed to
# the handler: display size, scaling, cropping, type flags, and the pixel
# dimensions of the image decoded from each descriptor's blob.
def test_picture__word95
  ## later openoffice formats don't work yet.
  stub = StubTextHandler.new
  word_parser = Rwv2.create_parser(@filename8)
  assert_nothing_raised do
    word_parser.set_text_handler(stub)
  end
  assert_nothing_raised do
    word_parser.parse
  end
  assert_equal(2, stub.pictures.size)

  # [picture index, expected display width, expected display height];
  # every other expectation is the same for both pictures.
  [[0, 1146, 1147], [1, 1145, 1146]].each do |idx, width, height|
    desc = stub.pictures.at(idx)
    assert_instance_of(Rwv2::PictureDescriptor, desc)
    assert_equal(width, desc.display_width)
    assert_equal(height, desc.display_height)
    assert_equal(999, desc.scaling_horizontal)
    assert_equal(999, desc.scaling_vertical)
    assert_equal(0, desc.crop_left)
    assert_equal(0, desc.crop_top)
    assert_equal(0, desc.crop_right)
    assert_equal(0, desc.crop_bottom)
    assert_equal(false, desc.is_bitmap?)
    assert_equal(false, desc.is_active_ole_object?)

    # from_blob returns a list of images; only the first one is inspected.
    image, = Magick::Image.from_blob(desc.blob)
    assert_equal(38, image.rows)
    assert_equal(38, image.columns)
  end
end
|
624
|
+
# Parses the ninth fixture and checks that the single text run containing a
# ligature character reaches the handler as the expected byte sequence.
def test_special_characters
  word_parser = Rwv2.create_parser(@filename9)
  stub = StubTextHandler.new
  assert_nothing_raised do
    word_parser.set_text_handler(stub)
  end
  word_parser.parse
  # The trailing bytes are written as octal escapes, exactly as expected
  # from the parser.
  assert_equal(["Ligature: \305\223"], stub.texts)
end
|
636
|
+
# helper methods
|
637
|
+
# Returns the indices of the items in +collection+ for which calling the
# method named by +symbol+ returns a truthy value, in iteration order.
#
# collection:: any object responding to each_with_index
# symbol::     name of the method to call on each item, e.g. :bold?
#
# Returns an Array of Integer indices (empty when nothing matches).
def index_select(collection, symbol)
  collection.each_with_index
            .select { |item, _idx| item.send(symbol) }
            .map { |_item, idx| idx }
end
|
644
|
+
end
|