rwv2 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +515 -0
- data/History.txt +5 -0
- data/INSTALL +112 -0
- data/InstalledFiles +3 -0
- data/Manifest.txt +42 -0
- data/README +92 -0
- data/README.txt +73 -0
- data/Rakefile +28 -0
- data/config.save +12 -0
- data/ext/rwv2/MANIFEST +9 -0
- data/ext/rwv2/Makefile +150 -0
- data/ext/rwv2/extconf.rb +33 -0
- data/ext/rwv2/include/rwv2.h +42 -0
- data/ext/rwv2/include/rwv2_associated_strings.h +40 -0
- data/ext/rwv2/include/rwv2_handlers.h +129 -0
- data/ext/rwv2/include/rwv2_parser.h +58 -0
- data/ext/rwv2/include/rwv2_properties.h +149 -0
- data/ext/rwv2/mkmf.log +12 -0
- data/ext/rwv2/rwv2.cpp +294 -0
- data/ext/rwv2/rwv2.o +0 -0
- data/ext/rwv2/rwv2.so +0 -0
- data/ext/rwv2/rwv2_handlers.cpp +81 -0
- data/ext/rwv2/rwv2_handlers.o +0 -0
- data/ext/rwv2/rwv2_parser.cpp +76 -0
- data/ext/rwv2/rwv2_parser.o +0 -0
- data/ext/rwv2/rwv2_properties.cpp +218 -0
- data/ext/rwv2/rwv2_properties.o +0 -0
- data/install.rb +1098 -0
- data/lib/rwv2/handlers.rb +52 -0
- data/lib/rwv2/rwv2.rb +28 -0
- data/rwv2-0.2.3.patch +223 -0
- data/test/data/not_a_word_document.rtf +16 -0
- data/test/data/test.doc +0 -0
- data/test/data/test2.doc +0 -0
- data/test/data/test3.doc +0 -0
- data/test/data/test4.doc +0 -0
- data/test/data/test5.doc +0 -0
- data/test/data/test6.doc +0 -0
- data/test/data/test7.doc +0 -0
- data/test/data/test8.doc +0 -0
- data/test/data/test9.doc +0 -0
- data/test/test_parser.rb +644 -0
- metadata +130 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rwv2 -- Microsoft Word Parser extension
|
4
|
+
# Copyright (C) 2003 Hannes Wyss, ywesee - intellectual capital connected
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected
|
21
|
+
# Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
22
|
+
# hwyss@ywesee.com
|
23
|
+
#
|
24
|
+
# Rwv2 -- Rwv2 -- 21.8.2003 -- hwyss@ywesee.com
|
25
|
+
|
26
|
+
module Rwv2
|
27
|
+
class SubDocumentHandler
|
28
|
+
def body_start; end
|
29
|
+
def body_end; end
|
30
|
+
def footnote_start; end
|
31
|
+
def footnote_end; end
|
32
|
+
def headers_start; end
|
33
|
+
def headers_end; end
|
34
|
+
def header_start(header_data); end
|
35
|
+
def header_end; end
|
36
|
+
end
|
37
|
+
class TableHandler
|
38
|
+
def row_start(table_properties); end
|
39
|
+
def row_end; end
|
40
|
+
def cell_start; end
|
41
|
+
def cell_end; end
|
42
|
+
end
|
43
|
+
class TextHandler
|
44
|
+
def section_start(section_properties); end
|
45
|
+
def section_end; end
|
46
|
+
def page_break; end
|
47
|
+
def paragraph_start(paragraph_properties); end
|
48
|
+
def paragraph_end; end
|
49
|
+
def run_of_text(text, character_properties); end
|
50
|
+
def picture(pict); end
|
51
|
+
end
|
52
|
+
end
|
data/lib/rwv2/rwv2.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rwv2 -- Microsoft Word Parser extension
|
4
|
+
# Copyright (C) 2003 Hannes Wyss, ywesee - intellectual capital connected
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected
|
21
|
+
# Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
22
|
+
# hwyss@ywesee.com
|
23
|
+
## rwv2 -- rwv2 -- 24.09.2003 -- rwaltert@ywesee.com
|
24
|
+
|
25
|
+
require 'rwv2'
|
26
|
+
require 'rwv2/handlers'
|
27
|
+
|
28
|
+
VERSION = '0.6.0'
|
data/rwv2-0.2.3.patch
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
diff -Nur wv2-0.2.3/src/handlers.cpp wv2-0.2.3-p1/src/handlers.cpp
|
2
|
+
--- wv2-0.2.3/src/handlers.cpp 2006-06-12 18:40:11.000000000 +0200
|
3
|
+
+++ wv2-0.2.3-p1/src/handlers.cpp 2008-02-18 18:20:26.000000000 +0100
|
4
|
+
@@ -177,6 +177,11 @@
|
5
|
+
{
|
6
|
+
}
|
7
|
+
|
8
|
+
+void TextHandler::pictureData( SharedPtr<const Word97::PICF> /*picf*/,
|
9
|
+
+ U8* /*buffer*/ )
|
10
|
+
+{
|
11
|
+
+}
|
12
|
+
+
|
13
|
+
void TextHandler::tableRowFound( const TableRowFunctor& tableRow, SharedPtr<const Word97::TAP> /*tap*/ )
|
14
|
+
{
|
15
|
+
tableRow();
|
16
|
+
diff -Nur wv2-0.2.3/src/handlers.h wv2-0.2.3-p1/src/handlers.h
|
17
|
+
--- wv2-0.2.3/src/handlers.h 2006-06-12 18:40:11.000000000 +0200
|
18
|
+
+++ wv2-0.2.3-p1/src/handlers.h 2008-02-18 18:20:30.000000000 +0100
|
19
|
+
@@ -251,7 +251,9 @@
|
20
|
+
* Very special characters (bad, bad name) are the ones which need additional
|
21
|
+
* information from the file (i.e. the plain "put the current date there" isn't sufficent).
|
22
|
+
*/
|
23
|
+
- enum VerySpecialCharacter { Picture = 1, FootnoteAuto = 2, FieldBegin = 19, FieldSeparator = 20,
|
24
|
+
+ enum VerySpecialCharacter { Picture = 1, FootnoteAuto = 2,
|
25
|
+
+ Drawing = 8,
|
26
|
+
+ FieldBegin = 19, FieldSeparator = 20,
|
27
|
+
FieldEnd = 21, FieldEscapeChar = 92 };
|
28
|
+
|
29
|
+
/**
|
30
|
+
@@ -299,6 +301,13 @@
|
31
|
+
virtual void fieldEnd( const FLD* fld, SharedPtr<const Word97::CHP> chp );
|
32
|
+
|
33
|
+
/**
|
34
|
+
+ * This method is called every time we find a picture.
|
35
|
+
+ * @param picf the picture-data.
|
36
|
+
+ */
|
37
|
+
+ virtual void pictureData( SharedPtr<const Word97::PICF> picf,
|
38
|
+
+ U8* buffer);
|
39
|
+
+
|
40
|
+
+ /**
|
41
|
+
* This method is called every time we find a table row. The default
|
42
|
+
* implementation invokes the functor, which triggers the parsing
|
43
|
+
* process for the given table row.
|
44
|
+
diff -Nur wv2-0.2.3/src/parser9x.cpp wv2-0.2.3-p1/src/parser9x.cpp
|
45
|
+
--- wv2-0.2.3/src/parser9x.cpp 2006-06-12 18:40:11.000000000 +0200
|
46
|
+
+++ wv2-0.2.3-p1/src/parser9x.cpp 2008-02-20 17:32:03.000000000 +0100
|
47
|
+
@@ -711,6 +711,7 @@
|
48
|
+
break;
|
49
|
+
|
50
|
+
// It has to be one of the very special characters...
|
51
|
+
+ case TextHandler::Drawing:
|
52
|
+
case TextHandler::Picture:
|
53
|
+
emitPictureData( chp );
|
54
|
+
break;
|
55
|
+
@@ -812,7 +813,6 @@
|
56
|
+
picf = new Word97::PICF( Word95::toWord97( Word95::PICF( stream, false ) ) );
|
57
|
+
else
|
58
|
+
picf = new Word97::PICF( stream, false );
|
59
|
+
- stream->pop();
|
60
|
+
|
61
|
+
if ( picf->cbHeader < 58 ) {
|
62
|
+
wvlog << "Error: Found an image with a PICF smaller than 58 bytes! Skipping the image." << std::endl;
|
63
|
+
@@ -838,8 +838,17 @@
|
64
|
+
<< std::endl << " dxaOrigin=" << picf->dxaOrigin << " dyaOrigin="
|
65
|
+
<< picf->dyaOrigin << std::endl;
|
66
|
+
#endif
|
67
|
+
- // for now
|
68
|
+
- delete picf;
|
69
|
+
+
|
70
|
+
+ /* extract the image blob */
|
71
|
+
+ stream->seek( chp->fcPic_fcObj_lTagObj + picf->cbHeader, G_SEEK_SET );
|
72
|
+
+ U32 len = picf->lcb - picf->cbHeader;
|
73
|
+
+ U8* buffer;
|
74
|
+
+ buffer = (U8 *)malloc(len * sizeof(U8));
|
75
|
+
+ stream->read(buffer, len);
|
76
|
+
+ m_textHandler->pictureData( picf, buffer );
|
77
|
+
+ free(buffer);
|
78
|
+
+
|
79
|
+
+ stream->pop();
|
80
|
+
}
|
81
|
+
|
82
|
+
void Parser9x::parseHeader( const HeaderData& data, unsigned char mask )
|
83
|
+
diff -Nur wv2-0.2.3/src/styles.cpp wv2-0.2.3-p1/src/styles.cpp
|
84
|
+
--- wv2-0.2.3/src/styles.cpp 2006-06-12 18:40:11.000000000 +0200
|
85
|
+
+++ wv2-0.2.3-p1/src/styles.cpp 2008-02-21 11:24:07.000000000 +0100
|
86
|
+
@@ -445,6 +445,9 @@
|
87
|
+
parentStyle = stylesheet.styleByIndex( m_std->istdBase );
|
88
|
+
if ( parentStyle ) {
|
89
|
+
const_cast<Style*>( parentStyle )->unwrapStyle( stylesheet, version );
|
90
|
+
+ // I'm getting Segfaults where there is no m_upechpx in parentStyle
|
91
|
+
+ if ( !parentStyle->m_upechpx )
|
92
|
+
+ parentStyle->m_upechpx = new UPECHPX();
|
93
|
+
bool ok;
|
94
|
+
m_upechpx->istd = stylesheet.indexByID( m_std->sti, ok );
|
95
|
+
mergeUpechpx( parentStyle, version );
|
96
|
+
@@ -665,7 +668,7 @@
|
97
|
+
else if ( cbStshi == Word97::STSHI::sizeOf )
|
98
|
+
m_stsh.read( tableStream, false );
|
99
|
+
else {
|
100
|
+
- wvlog << "Detected a different STSHI, check this (trying to read Word97 one)" << std::endl;
|
101
|
+
+ wvlog << "Detected a different STSHI, check this (trying to read Word97 one - probably Latent Style Data added in Word2003)" << std::endl;
|
102
|
+
m_stsh.read( tableStream, false );
|
103
|
+
}
|
104
|
+
|
105
|
+
diff -Nur wv2-0.2.3/src/word97_generated.h wv2-0.2.3-p1/src/word97_generated.h
|
106
|
+
--- wv2-0.2.3/src/word97_generated.h 2006-06-12 18:40:12.000000000 +0200
|
107
|
+
+++ wv2-0.2.3-p1/src/word97_generated.h 2008-02-21 11:03:07.000000000 +0100
|
108
|
+
@@ -8623,6 +8623,18 @@
|
109
|
+
*/
|
110
|
+
U16 rgftcStandardChpStsh[3];
|
111
|
+
|
112
|
+
+
|
113
|
+
+ /** introduced in Word2003 **/
|
114
|
+
+ /**
|
115
|
+
+ * size of each lsd in mpstilsd. The count of lsd's is stiMaxWhenSaved
|
116
|
+
+ */
|
117
|
+
+ //U16 cbLSD;
|
118
|
+
+
|
119
|
+
+ /**
|
120
|
+
+ * latent style data (stiMax == stiMaxWhenSaved upon save!)
|
121
|
+
+ */
|
122
|
+
+ //LSD mpstilsd[3];
|
123
|
+
+
|
124
|
+
}; // STSHI
|
125
|
+
|
126
|
+
bool operator==(const STSHI &lhs, const STSHI &rhs);
|
127
|
+
diff -Nur wv2-0.2.3/src/word97_helper.cpp wv2-0.2.3-p1/src/word97_helper.cpp
|
128
|
+
--- wv2-0.2.3/src/word97_helper.cpp 2006-06-12 18:50:45.000000000 +0200
|
129
|
+
+++ wv2-0.2.3-p1/src/word97_helper.cpp 2008-02-21 11:33:21.000000000 +0100
|
130
|
+
@@ -1137,8 +1137,14 @@
|
131
|
+
fBold = *ptr == 1;
|
132
|
+
else if ( *ptr == 128 && paragraphStyle )
|
133
|
+
fBold = paragraphStyle->chp().fBold;
|
134
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
135
|
+
- fBold = !( paragraphStyle->chp().fBold );
|
136
|
+
+ else if ( *ptr == 129 )
|
137
|
+
+ /**
|
138
|
+
+ * there are some Word-Documents where the Nil Style seems to be
|
139
|
+
+ * defined as Reversed. Obviously the Nil Style has no
|
140
|
+
+ * ParentStyle, which is why I've moved the paragraphStyle-check
|
141
|
+
+ * from the else if clause down to the next line.
|
142
|
+
+ */
|
143
|
+
+ fBold = !( paragraphStyle && paragraphStyle->chp().fBold );
|
144
|
+
else
|
145
|
+
wvlog << "Warning: sprmCFBold couldn't find a style" << std::endl;
|
146
|
+
break;
|
147
|
+
@@ -1147,8 +1153,8 @@
|
148
|
+
fItalic = *ptr == 1;
|
149
|
+
else if ( *ptr == 128 && paragraphStyle )
|
150
|
+
fItalic = paragraphStyle->chp().fItalic;
|
151
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
152
|
+
- fItalic = !( paragraphStyle->chp().fItalic );
|
153
|
+
+ else if ( *ptr == 129 )
|
154
|
+
+ fItalic = !( paragraphStyle && paragraphStyle->chp().fItalic );
|
155
|
+
else
|
156
|
+
wvlog << "Warning: sprmCFItalic couldn't find a style" << std::endl;
|
157
|
+
break;
|
158
|
+
@@ -1157,8 +1163,8 @@
|
159
|
+
fStrike = *ptr == 1;
|
160
|
+
else if ( *ptr == 128 && paragraphStyle )
|
161
|
+
fStrike = paragraphStyle->chp().fStrike;
|
162
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
163
|
+
- fStrike = !( paragraphStyle->chp().fStrike );
|
164
|
+
+ else if ( *ptr == 129 )
|
165
|
+
+ fStrike = !( paragraphStyle && paragraphStyle->chp().fStrike );
|
166
|
+
else
|
167
|
+
wvlog << "Warning: sprmCFStrike couldn't find a style" << std::endl;
|
168
|
+
break;
|
169
|
+
@@ -1167,8 +1173,8 @@
|
170
|
+
fOutline = *ptr == 1;
|
171
|
+
else if ( *ptr == 128 && paragraphStyle )
|
172
|
+
fOutline = paragraphStyle->chp().fOutline;
|
173
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
174
|
+
- fOutline = !( paragraphStyle->chp().fOutline );
|
175
|
+
+ else if ( *ptr == 129 )
|
176
|
+
+ fOutline = !( paragraphStyle && paragraphStyle->chp().fOutline );
|
177
|
+
else
|
178
|
+
wvlog << "Warning: sprmCFOutline couldn't find a style" << std::endl;
|
179
|
+
break;
|
180
|
+
@@ -1177,8 +1183,8 @@
|
181
|
+
fShadow = *ptr == 1;
|
182
|
+
else if ( *ptr == 128 && paragraphStyle )
|
183
|
+
fShadow = paragraphStyle->chp().fShadow;
|
184
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
185
|
+
- fShadow = !( paragraphStyle->chp().fShadow );
|
186
|
+
+ else if ( *ptr == 129 )
|
187
|
+
+ fShadow = !( paragraphStyle && paragraphStyle->chp().fShadow );
|
188
|
+
else
|
189
|
+
wvlog << "Warning: sprmCFShadow couldn't find a style" << std::endl;
|
190
|
+
break;
|
191
|
+
@@ -1187,8 +1193,8 @@
|
192
|
+
fSmallCaps = *ptr == 1;
|
193
|
+
else if ( *ptr == 128 && paragraphStyle )
|
194
|
+
fSmallCaps = paragraphStyle->chp().fSmallCaps;
|
195
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
196
|
+
- fSmallCaps = !( paragraphStyle->chp().fSmallCaps );
|
197
|
+
+ else if ( *ptr == 129 )
|
198
|
+
+ fSmallCaps = !( paragraphStyle && paragraphStyle->chp().fSmallCaps );
|
199
|
+
else
|
200
|
+
wvlog << "Warning: sprmCFSmallCaps couldn't find a style" << std::endl;
|
201
|
+
break;
|
202
|
+
@@ -1197,8 +1203,8 @@
|
203
|
+
fCaps = *ptr == 1;
|
204
|
+
else if ( *ptr == 128 && paragraphStyle )
|
205
|
+
fCaps = paragraphStyle->chp().fCaps;
|
206
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
207
|
+
- fCaps = !( paragraphStyle->chp().fCaps );
|
208
|
+
+ else if ( *ptr == 129 )
|
209
|
+
+ fCaps = !( paragraphStyle && paragraphStyle->chp().fCaps );
|
210
|
+
else
|
211
|
+
wvlog << "Warning: sprmCFCaps couldn't find a style" << std::endl;
|
212
|
+
break;
|
213
|
+
@@ -1207,8 +1213,8 @@
|
214
|
+
fVanish = *ptr == 1;
|
215
|
+
else if ( *ptr == 128 && paragraphStyle )
|
216
|
+
fVanish = paragraphStyle->chp().fVanish;
|
217
|
+
- else if ( *ptr == 129 && paragraphStyle )
|
218
|
+
- fVanish = !( paragraphStyle->chp().fVanish );
|
219
|
+
+ else if ( *ptr == 129 )
|
220
|
+
+ fVanish = !( paragraphStyle && paragraphStyle->chp().fVanish );
|
221
|
+
else
|
222
|
+
wvlog << "Warning: sprmCFVanish couldn't find a style" << std::endl;
|
223
|
+
break;
|
@@ -0,0 +1,16 @@
|
|
1
|
+
{\rtf1\ansi\deff0\adeflang1025
|
2
|
+
{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fnil\fprq2\fcharset0 Arial;}{\f3\fnil\fprq0\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 HG Mincho Light J{\*\falt msmincho};}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}}
|
3
|
+
{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
|
4
|
+
{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\snext1 Normal;}
|
5
|
+
{\s2\sb240\sa120\keepn\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs28\lang255\ltrch\dbch\af4\langfe255\hich\f3\fs28\lang2057\loch\f3\fs28\lang2057\sbasedon1\snext3 Heading;}
|
6
|
+
{\s3\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\sbasedon1\snext3 Body Text;}
|
7
|
+
{\s4\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\sbasedon3\snext4 List;}
|
8
|
+
{\s5\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\i\loch\f0\fs24\lang2057\i\sbasedon1\snext5 caption;}
|
9
|
+
{\s6\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057\sbasedon1\snext6 Index;}
|
10
|
+
}
|
11
|
+
{\info{\author Hannes Wyss}{\creatim\yr2008\mo2\dy19\hr11\min34}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709
|
12
|
+
{\*\pgdsctbl
|
13
|
+
{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
|
14
|
+
\paperh15840\paperw12240\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
15
|
+
\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af2\langfe255\hich\f0\fs24\lang2057\loch\f0\fs24\lang2057 {\rtlch \ltrch\loch\f0\fs24\lang2057\i0\b0 An RTF-File}
|
16
|
+
\par }
|
data/test/data/test.doc
ADDED
Binary file
|
data/test/data/test2.doc
ADDED
Binary file
|
data/test/data/test3.doc
ADDED
Binary file
|
data/test/data/test4.doc
ADDED
Binary file
|
data/test/data/test5.doc
ADDED
Binary file
|
data/test/data/test6.doc
ADDED
Binary file
|
data/test/data/test7.doc
ADDED
Binary file
|
data/test/data/test8.doc
ADDED
Binary file
|
data/test/data/test9.doc
ADDED
Binary file
|
data/test/test_parser.rb
ADDED
@@ -0,0 +1,644 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rwv2 -- Microsoft Word Parser extension
|
4
|
+
# Copyright (C) 2003 Hannes Wyss, ywesee - intellectual capital connected
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected
|
21
|
+
# Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
22
|
+
# hwyss@ywesee.com
|
23
|
+
#
|
24
|
+
# TestParser -- Rwv2 -- 21.8.2003 -- hwyss@ywesee.com
|
25
|
+
|
26
|
+
$: << File.expand_path('../ext/rwv2', File.dirname(__FILE__))
|
27
|
+
$: << File.expand_path('../lib', File.dirname(__FILE__))
|
28
|
+
|
29
|
+
require 'test/unit'
|
30
|
+
require 'rwv2/rwv2'
|
31
|
+
require 'RMagick'
|
32
|
+
require 'iconv'
|
33
|
+
|
34
|
+
class StubInlineReplacementHandler
|
35
|
+
attr_accessor :non_required_hyphen
|
36
|
+
def column_break
|
37
|
+
"c"
|
38
|
+
end
|
39
|
+
def hard_line_break
|
40
|
+
"\n"
|
41
|
+
end
|
42
|
+
def non_breaking_hyphen
|
43
|
+
"="
|
44
|
+
end
|
45
|
+
def non_breaking_space
|
46
|
+
"_"
|
47
|
+
end
|
48
|
+
def tab
|
49
|
+
"t"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
class StubIncompleteReplacementHandler
|
53
|
+
def hard_line_break
|
54
|
+
"\n"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
["body", "footnote", "headers", "header"].each { |tpe|
|
58
|
+
eval <<-EOF
|
59
|
+
class StubSubDocumentHandler
|
60
|
+
attr_reader :#{tpe}_starts, :#{tpe}_ends
|
61
|
+
def #{tpe}_start
|
62
|
+
@#{tpe}_starts ||= 0
|
63
|
+
@#{tpe}_starts += 1
|
64
|
+
end
|
65
|
+
def #{tpe}_end
|
66
|
+
@#{tpe}_ends ||= 0
|
67
|
+
@#{tpe}_ends += 1
|
68
|
+
end
|
69
|
+
end
|
70
|
+
EOF
|
71
|
+
}
|
72
|
+
class StubSubDocumentHandler
|
73
|
+
attr_reader :header_starts
|
74
|
+
def initialize
|
75
|
+
@header_starts = []
|
76
|
+
end
|
77
|
+
def header_start(header_type)
|
78
|
+
@header_starts << header_type
|
79
|
+
end
|
80
|
+
end
|
81
|
+
class StubTextHandler
|
82
|
+
attr_accessor :texts, :formats, :section_properties, :section_ends
|
83
|
+
attr_accessor :page_breaks, :paragraph_properties, :paragraph_ends, :pictures
|
84
|
+
def initialize
|
85
|
+
@iconv = Iconv.new('utf8', 'utf-16')
|
86
|
+
@pictures = []
|
87
|
+
@formats = []
|
88
|
+
@texts = []
|
89
|
+
@section_properties = []
|
90
|
+
@paragraph_properties = []
|
91
|
+
@section_ends = 0
|
92
|
+
@paragraph_ends = 0
|
93
|
+
@page_breaks = 0
|
94
|
+
end
|
95
|
+
def picture(picture)
|
96
|
+
@pictures.push picture
|
97
|
+
end
|
98
|
+
def section_start(sep)
|
99
|
+
@section_properties << sep
|
100
|
+
end
|
101
|
+
def section_end
|
102
|
+
@section_ends += 1
|
103
|
+
end
|
104
|
+
def page_break
|
105
|
+
@page_breaks += 1
|
106
|
+
end
|
107
|
+
def paragraph_start(pap)
|
108
|
+
@paragraph_properties << pap
|
109
|
+
end
|
110
|
+
def paragraph_end
|
111
|
+
@paragraph_ends += 1
|
112
|
+
end
|
113
|
+
def run_of_text(text, format=nil)
|
114
|
+
@formats << format unless format.nil?
|
115
|
+
@texts << @iconv.iconv(text)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
class StubTableHandler
|
119
|
+
attr_reader :row_starts, :row_ends, :cell_starts, :cell_ends
|
120
|
+
def initialize
|
121
|
+
@row_starts = []
|
122
|
+
@row_ends = 0
|
123
|
+
@cell_starts = 0
|
124
|
+
@cell_ends = 0
|
125
|
+
end
|
126
|
+
def row_start(properties=nil)
|
127
|
+
@row_starts << properties
|
128
|
+
end
|
129
|
+
def row_end
|
130
|
+
@row_ends += 1
|
131
|
+
end
|
132
|
+
def cell_start
|
133
|
+
@cell_starts += 1
|
134
|
+
end
|
135
|
+
def cell_end
|
136
|
+
@cell_ends += 1
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
class TestRwv2Parser < Test::Unit::TestCase
|
141
|
+
def setup
|
142
|
+
@filename = File.expand_path('data/test.doc', File.dirname(__FILE__))
|
143
|
+
@filename2 = File.expand_path('data/test2.doc', File.dirname(__FILE__))
|
144
|
+
@filename3 = File.expand_path('data/test3.doc', File.dirname(__FILE__))
|
145
|
+
@filename4 = File.expand_path('data/test4.doc', File.dirname(__FILE__))
|
146
|
+
@filename5 = File.expand_path('data/test5.doc', File.dirname(__FILE__))
|
147
|
+
@filename6 = File.expand_path('data/test6.doc', File.dirname(__FILE__))
|
148
|
+
@filename7 = File.expand_path('data/test7.doc', File.dirname(__FILE__))
|
149
|
+
@filename8 = File.expand_path('data/test8.doc', File.dirname(__FILE__))
|
150
|
+
@filename9 = File.expand_path('data/test9.doc', File.dirname(__FILE__))
|
151
|
+
@unavailable = File.expand_path('data/unavailable.doc', File.dirname(__FILE__))
|
152
|
+
@rtf = File.expand_path('data/not_a_word_document.rtf', File.dirname(__FILE__))
|
153
|
+
@ir_handler = StubInlineReplacementHandler.new
|
154
|
+
@ir_handler.non_required_hyphen = "-"
|
155
|
+
end
|
156
|
+
def test_create_parser
|
157
|
+
assert_nothing_raised {
|
158
|
+
Rwv2.create_parser(@filename)
|
159
|
+
}
|
160
|
+
assert_nothing_raised {
|
161
|
+
Rwv2.create_parser_from_content(File.read(@filename))
|
162
|
+
}
|
163
|
+
end
|
164
|
+
def test_inline_replacement_handler
|
165
|
+
parser = Rwv2.create_parser(@filename)
|
166
|
+
handler = StubTextHandler.new
|
167
|
+
parser.set_text_handler(handler)
|
168
|
+
assert_nothing_raised {
|
169
|
+
parser.set_inline_replacement_handler(@ir_handler)
|
170
|
+
}
|
171
|
+
parser.parse
|
172
|
+
expected = [
|
173
|
+
"Paragraph 1, Standard",
|
174
|
+
"Paragraph 2, Bold",
|
175
|
+
"Paragraph 3, Italic",
|
176
|
+
"Paragraph 4, Underlined",
|
177
|
+
"Paragraph 5, Bold Italic",
|
178
|
+
"Paragraph 6, Bold Underlined",
|
179
|
+
"Paragraph 7, Italic Underlined",
|
180
|
+
"Paragraph 8, Bold Italic Underlined",
|
181
|
+
"Paragraph 9, ",
|
182
|
+
"mixed Formats",
|
183
|
+
"TabtTab",
|
184
|
+
"HardLineBreak\nHardLineBreak",
|
185
|
+
"ColumnBreakcColumnBreak",
|
186
|
+
"NonBreakingHyphen=NonBreakingHyphen",
|
187
|
+
"NonRequiredHyphen-NonRequiredHyphen",
|
188
|
+
"NonBreakingSpace",
|
189
|
+
"_",
|
190
|
+
"NonBreakingSpace",
|
191
|
+
]
|
192
|
+
assert_equal(expected, handler.texts)
|
193
|
+
end
|
194
|
+
def test_incomplete_replacement_handler
|
195
|
+
parser = Rwv2.create_parser(@filename)
|
196
|
+
handler = StubTextHandler.new
|
197
|
+
parser.set_text_handler(handler)
|
198
|
+
replacer = StubIncompleteReplacementHandler.new
|
199
|
+
assert_nothing_raised {
|
200
|
+
parser.set_inline_replacement_handler(replacer)
|
201
|
+
}
|
202
|
+
parser.parse
|
203
|
+
expected = [
|
204
|
+
"Paragraph 1, Standard",
|
205
|
+
"Paragraph 2, Bold",
|
206
|
+
"Paragraph 3, Italic",
|
207
|
+
"Paragraph 4, Underlined",
|
208
|
+
"Paragraph 5, Bold Italic",
|
209
|
+
"Paragraph 6, Bold Underlined",
|
210
|
+
"Paragraph 7, Italic Underlined",
|
211
|
+
"Paragraph 8, Bold Italic Underlined",
|
212
|
+
"Paragraph 9, ",
|
213
|
+
"mixed Formats",
|
214
|
+
"Tab\tTab",
|
215
|
+
"HardLineBreak\nHardLineBreak",
|
216
|
+
"ColumnBreak\016ColumnBreak",
|
217
|
+
"NonBreakingHyphen\036NonBreakingHyphen",
|
218
|
+
"NonRequiredHyphen\037NonRequiredHyphen",
|
219
|
+
"NonBreakingSpace",
|
220
|
+
"\302\240",
|
221
|
+
"NonBreakingSpace"
|
222
|
+
]
|
223
|
+
assert_equal(expected, handler.texts)
|
224
|
+
end
|
225
|
+
def test_illegal_replacement_handler
|
226
|
+
parser = Rwv2.create_parser(@filename)
|
227
|
+
handler = StubTextHandler.new
|
228
|
+
parser.set_text_handler(handler)
|
229
|
+
parser.set_inline_replacement_handler(@ir_handler)
|
230
|
+
@ir_handler.non_required_hyphen = ""
|
231
|
+
assert_raises(RuntimeError) {
|
232
|
+
parser.parse
|
233
|
+
}
|
234
|
+
@ir_handler.non_required_hyphen = "--"
|
235
|
+
assert_raises(RuntimeError) {
|
236
|
+
parser.parse
|
237
|
+
}
|
238
|
+
end
|
239
|
+
def test_subdocument_handler
|
240
|
+
parser = Rwv2.create_parser(@filename4)
|
241
|
+
handler = StubSubDocumentHandler.new
|
242
|
+
assert_nothing_raised {
|
243
|
+
parser.set_subdocument_handler(handler)
|
244
|
+
}
|
245
|
+
parser.parse
|
246
|
+
assert_equal(1, handler.body_starts)
|
247
|
+
assert_equal(1, handler.body_ends)
|
248
|
+
assert_equal(1, handler.footnote_starts)
|
249
|
+
assert_equal(1, handler.footnote_ends)
|
250
|
+
assert_equal(1, handler.headers_starts)
|
251
|
+
assert_equal(1, handler.headers_ends)
|
252
|
+
assert_equal(2, handler.header_ends)
|
253
|
+
assert_equal([Rwv2::HEADER_ODD, Rwv2::FOOTER_ODD], handler.header_starts)
|
254
|
+
end
|
255
|
+
def test_table_handler
|
256
|
+
parser = Rwv2.create_parser(@filename5)
|
257
|
+
handler = StubTableHandler.new
|
258
|
+
assert_nothing_raised {
|
259
|
+
parser.set_table_handler(handler)
|
260
|
+
}
|
261
|
+
parser.parse
|
262
|
+
assert_equal(6, handler.row_ends)
|
263
|
+
assert_equal(11, handler.cell_starts)
|
264
|
+
assert_equal(11, handler.cell_ends)
|
265
|
+
head = handler.row_starts.at(0)
|
266
|
+
row0 = handler.row_starts.at(1)
|
267
|
+
row1 = handler.row_starts.at(2)
|
268
|
+
row2 = handler.row_starts.at(3)
|
269
|
+
row3 = handler.row_starts.at(4)
|
270
|
+
row4 = handler.row_starts.at(5)
|
271
|
+
assert_equal(283, row0.row_height)
|
272
|
+
assert_equal(-283, row1.row_height)
|
273
|
+
assert_equal(2, head.row_cells)
|
274
|
+
assert_equal(2, row0.row_cells)
|
275
|
+
assert_equal(2, row1.row_cells)
|
276
|
+
assert_equal(1, row2.row_cells)
|
277
|
+
assert_equal(2, row3.row_cells)
|
278
|
+
assert_equal(2, row4.row_cells)
|
279
|
+
assert_equal(3, row0.cell_boundaries.size)
|
280
|
+
assert_equal(0, row0.cell_boundaries.at(0))
|
281
|
+
assert_equal(4818, row0.cell_boundaries.at(1))
|
282
|
+
assert_equal(9639, row0.cell_boundaries.at(2))
|
283
|
+
assert_equal(2, row0.cell_descriptors.size)
|
284
|
+
ct0 = row0.cell_descriptors.first
|
285
|
+
cta = row0.cell_descriptors.last
|
286
|
+
ctb = row1.cell_descriptors.first
|
287
|
+
assert_instance_of(Rwv2::TableProperties::CellDescriptor, ct0)
|
288
|
+
assert_equal(false, ct0.first_merged?)
|
289
|
+
assert_equal(Rwv2::TableProperties::CellDescriptor::ALIGN_TOP, ct0.vertical_align)
|
290
|
+
assert_equal(Rwv2::TableProperties::CellDescriptor::ALIGN_CENTER, cta.vertical_align)
|
291
|
+
assert_equal(Rwv2::TableProperties::CellDescriptor::ALIGN_BOTTOM, ctb.vertical_align)
|
292
|
+
ct2 = row3.cell_descriptors.first
|
293
|
+
assert_equal(true, ct2.vertical_merged?)
|
294
|
+
assert_equal(true, ct2.vertical_restart?)
|
295
|
+
# ct2 = row3.cell_descriptors.first
|
296
|
+
# assert_equal(true, ct2.vertical)
|
297
|
+
|
298
|
+
# FIXME: the following are untested,
|
299
|
+
# need a _real_ Wordfile to test...
|
300
|
+
# assert_equal(Rwv2::TableProperties::ALIGN_LEFT, row0.align)
|
301
|
+
# assert_equal(Rwv2::TableProperties::ALIGN_LEFT, row1.align)
|
302
|
+
# row0.gap_half
|
303
|
+
# assert_equal(true, row0.cant_split)
|
304
|
+
# assert_equal(false, row1.cant_split)
|
305
|
+
# ct1 = row2.cell_descriptors.first
|
306
|
+
# assert_equal(true, ct1.merged)
|
307
|
+
# assert_equal(true, ct1.first_merged)
|
308
|
+
# :rotate_font, :backward, :vertical_merged,
|
309
|
+
# :vertical_restart, :vertical_align
|
310
|
+
end
|
311
|
+
def test_text_handler
|
312
|
+
parser = Rwv2.create_parser(@filename)
|
313
|
+
handler = StubTextHandler.new
|
314
|
+
assert_nothing_raised {
|
315
|
+
parser.set_text_handler(handler)
|
316
|
+
}
|
317
|
+
parser.parse
|
318
|
+
expected = [
|
319
|
+
"Paragraph 1, Standard",
|
320
|
+
"Paragraph 2, Bold",
|
321
|
+
"Paragraph 3, Italic",
|
322
|
+
"Paragraph 4, Underlined",
|
323
|
+
"Paragraph 5, Bold Italic",
|
324
|
+
"Paragraph 6, Bold Underlined",
|
325
|
+
"Paragraph 7, Italic Underlined",
|
326
|
+
"Paragraph 8, Bold Italic Underlined",
|
327
|
+
"Paragraph 9, ",
|
328
|
+
"mixed Formats",
|
329
|
+
"Tab\tTab",
|
330
|
+
"HardLineBreak\vHardLineBreak",
|
331
|
+
"ColumnBreak\016ColumnBreak",
|
332
|
+
"NonBreakingHyphen\036NonBreakingHyphen",
|
333
|
+
"NonRequiredHyphen\037NonRequiredHyphen",
|
334
|
+
"NonBreakingSpace",
|
335
|
+
"\302\240",
|
336
|
+
"NonBreakingSpace"
|
337
|
+
]
|
338
|
+
assert_equal(expected, handler.texts)
|
339
|
+
paps = handler.paragraph_properties
|
340
|
+
assert_equal(16, paps.size)
|
341
|
+
assert_equal(16, handler.paragraph_ends)
|
342
|
+
pap0 = paps.at(0)
|
343
|
+
pap1 = paps.at(1)
|
344
|
+
pap2 = paps.at(2)
|
345
|
+
pap3 = paps.at(3)
|
346
|
+
pap4 = paps.at(4)
|
347
|
+
pap5 = paps.at(5)
|
348
|
+
pap6 = paps.at(6)
|
349
|
+
assert_instance_of(Rwv2::ParagraphProperties, pap0)
|
350
|
+
assert_equal(Rwv2::ALIGN_LEFT, pap0.align)
|
351
|
+
assert_equal(Rwv2::ALIGN_CENTER, pap1.align)
|
352
|
+
assert_equal(Rwv2::ALIGN_RIGHT, pap2.align)
|
353
|
+
assert_equal(Rwv2::ALIGN_JUSTIFY, pap3.align)
|
354
|
+
assert_equal(true, pap0.keep?)
|
355
|
+
assert_equal(false, pap0.keep_with_next?)
|
356
|
+
assert_equal(false, pap0.widow_control?)
|
357
|
+
assert_equal(false, pap1.keep?)
|
358
|
+
assert_equal(true, pap1.keep_with_next?)
|
359
|
+
assert_equal(false, pap1.widow_control?)
|
360
|
+
assert_equal(false, pap2.keep?)
|
361
|
+
assert_equal(false, pap2.keep_with_next?)
|
362
|
+
assert_equal(true, pap2.widow_control?)
|
363
|
+
assert_equal(false, pap3.keep?)
|
364
|
+
assert_equal(false, pap3.keep_with_next?)
|
365
|
+
assert_equal(false, pap3.widow_control?)
|
366
|
+
# FIXME does this work with a real Wordfile?
|
367
|
+
# assert_equal(false, pap0.page_break_before?)
|
368
|
+
# assert_equal(false, pap1.page_break_before?)
|
369
|
+
# assert_equal(false, pap2.page_break_before?)
|
370
|
+
# assert_equal(true, pap3.page_break_before?)
|
371
|
+
assert_equal(1680, pap4.indent_right)
|
372
|
+
assert_equal(0, pap4.indent_left)
|
373
|
+
assert_equal(0, pap4.indent_first_line)
|
374
|
+
assert_equal(0, pap5.indent_right)
|
375
|
+
assert_equal(570, pap5.indent_left)
|
376
|
+
assert_equal(0, pap5.indent_first_line)
|
377
|
+
assert_equal(0, pap6.indent_right)
|
378
|
+
assert_equal(570, pap6.indent_left)
|
379
|
+
assert_equal(-285, pap6.indent_first_line)
|
380
|
+
end
|
381
|
+
# Installs a plain Rwv2::TextHandler instance (instead of the
# StubTextHandler used by the other tests in this file) and asserts that
# parsing @filename raises nothing.
def test_incomplete_text_handler
  word_parser = Rwv2.create_parser(@filename)
  word_parser.set_text_handler(Rwv2::TextHandler.new)
  assert_nothing_raised do
    word_parser.parse
  end
end
|
387
|
+
# Parses @filename with a StubTextHandler and checks which of the recorded
# character-property entries answer true to bold? and italic?.
def test_character_properties
  parser = Rwv2.create_parser(@filename)
  handler = StubTextHandler.new
  parser.set_text_handler(handler)
  parser.parse
  formats = handler.formats
  assert_equal(false, formats.empty?,
    "The Parser recorded no Character Properties")
  # index_select returns the positions in formats whose predicate is truthy.
  assert_equal([1,4,5,7,9], index_select(formats, :bold?))
  assert_equal([2,4,6,7], index_select(formats, :italic?))
end
|
401
|
+
# Parses @filename2 and verifies both the recorded text runs and, for each
# boolean character property, the indices of the runs that carry it.
def test_character_properties2
  word_parser = Rwv2.create_parser(@filename2)
  stub = StubTextHandler.new
  word_parser.set_text_handler(stub)
  word_parser.parse
  assert_equal([
    "The ", "new Text!",
    "This will be deleted.",
    "Outlined",
    "Small Caps",
    "Caps",
    "Strikethrough",
    "Shadow",
    "Lower Case",
    "Embossed",
    "Engraved",
    "Double Strikethrough",
  ], stub.texts)
  recorded = stub.formats
  assert_equal(false, recorded.empty?,
    "The Parser recorded no Character Properties")
  # Each pair is [expected run indices, predicate]; checked in this order.
  # FIXME: [[8], :lowercase?] is deliberately absent -
  # our test-file is made with Openoffice - which does not set the
  # lowercase-flag in Word Files...
  [
    [[0,1], :rev_mark?],
    [[2], :rev_mark_del?],
    [[3], :outline?],
    [[4], :small_caps?],
    [[5], :caps?],
    [[6], :strikethrough?],
    [[7], :shadow?],
    [[9], :emboss?],
    [[10], :imprint?],
    [[11], :double_strikethrough?],
  ].each do |indices, predicate|
    assert_equal(indices, index_select(recorded, predicate))
  end
end
|
437
|
+
# Parses @filename3 and verifies the text runs, the vertical position of
# the first three runs, the underline style of each run, and the scale and
# font size of the first run.
def test_character_properties3
  word_parser = Rwv2.create_parser(@filename3)
  stub = StubTextHandler.new
  word_parser.set_text_handler(stub)
  word_parser.parse
  assert_equal([
    "Normal",
    "Superscript",
    "Subscript",
    "Single",
    "By Word",
    "Double",
    "Dotted",
    "Thick",
    "Dash",
    "Dot Dash",
    "Dot Dot Dash",
    "Wave",
  ], stub.texts)
  recorded = stub.formats
  assert_equal(false, recorded.empty?,
    "The Parser recorded no Character Properties")
  props = Rwv2::CharacterProperties
  # Runs 0..2 are expected to be normal, superscript and subscript.
  [
    props::POSITION_NORMAL,
    props::POSITION_SUPERSCRIPT,
    props::POSITION_SUBSCRIPT,
  ].each_with_index do |position, idx|
    assert_equal(position, recorded[idx].position)
  end
  assert_equal(false, recorded[0].underline)
  # FIXME: index 3 ("Single") should be part of this list as well:
  #assert_equal([3,4,5,6,7,8,9,10,11], index_select(recorded, :underline))
  # Openoffice saves simple underline somewhere else?
  assert_equal([4,5,6,7,8,9,10,11], index_select(recorded, :underline)) # FIXME
  assert_equal(props::UNDERLINE_NONE, recorded[0].underline)
  #assert_equal(props::UNDERLINE_SINGLE, recorded[3].underline) # FIXME
  # Pairs of [run index, expected underline constant], checked in order.
  [
    [4, props::UNDERLINE_BY_WORD],
    [5, props::UNDERLINE_DOUBLE],
    [6, props::UNDERLINE_DOTTED],
    [7, props::UNDERLINE_THICK],
    [8, props::UNDERLINE_DASH],
    [9, props::UNDERLINE_DOT_DASH],
    [10, props::UNDERLINE_DOT_DOT_DASH],
    [11, props::UNDERLINE_WAVE],
  ].each do |idx, style|
    assert_equal(style, recorded[idx].underline)
  end
  assert_equal(100, recorded[0].scale)
  assert_equal(24, recorded[0].fontsize)
end
|
480
|
+
# Parses @filename6 and verifies the number of sections, section ends and
# page breaks seen by the handler, several properties of the first section,
# and the column count of every section.
def test_section_properties
  word_parser = Rwv2.create_parser(@filename6)
  stub = StubTextHandler.new
  word_parser.set_text_handler(stub)
  word_parser.parse
  sections = stub.section_properties
  assert_equal(5, sections.size)
  assert_equal(5, stub.section_ends)
  assert_equal(1, stub.page_breaks)

  first_section = sections.at(0)
  assert_instance_of(Rwv2::SectionProperties, first_section)
  assert_equal(Rwv2::SectionProperties::NUMBER_ARABIC,
    first_section.page_number_format)
  assert_equal(false, first_section.title_page?)
  assert_equal(false, first_section.unlocked?)
  assert_equal(false, first_section.page_number_restart?)
  assert_equal(false, first_section.line_numbering_modulus)

  # Expected column count per section, in document order.
  [1, 2, 3, 2, 1].each_with_index do |column_count, idx|
    assert_equal(column_count, sections.at(idx).columns)
  end

  # FIXME: get a real wordfile...
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sections.at(0).break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sections.at(1).break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sections.at(2).break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sections.at(3).break_code)
  # assert_equal(Rwv2::SectionProperties::BREAK_NONE, sections.at(4).break_code)
  # assert_equal(Rwv2::SectionProperties::LINE_NUMBERING_PER_PAGE, sections.at(0).line_numbering_code)
  # assert_equal(true, sections.at(0).endnote?)
  # assert_equal(true, sections.at(1).endnote?)
  # assert_equal(false, sections.at(2).endnote?)
  # assert_equal(true, sections.at(3).endnote?)
end
|
518
|
+
# Hands three handlers to the parser without keeping any other reference to
# them in this method, forces a garbage-collection run, and then asserts
# that parsing still raises nothing.
# NOTE(review): presumably this guards against the extension losing its
# handlers to the GC - confirm against the C++ sources.
def test_gc
  word_parser = Rwv2.create_parser(@filename)
  word_parser.set_inline_replacement_handler(@ir_handler.dup)
  word_parser.set_subdocument_handler(StubSubDocumentHandler.new)
  word_parser.set_text_handler(StubTextHandler.new)
  GC.start
  sleep(0.5)
  assert_nothing_raised do
    word_parser.parse
  end
end
|
529
|
+
# Parses @filename7 and verifies the four tab descriptors attached to the
# first paragraph: their positions and (for three of them) their alignment.
def test_tab_descriptors
  word_parser = Rwv2.create_parser(@filename7)
  stub = StubTextHandler.new
  assert_nothing_raised do
    word_parser.set_text_handler(stub)
  end
  word_parser.parse
  first_pap = stub.paragraph_properties.at(0)
  assert_instance_of(Rwv2::ParagraphProperties, first_pap)
  stops = first_pap.tab_descriptors
  assert_equal(4, stops.size)
  left_stop, right_stop, decimal_stop, center_stop = stops
  assert_instance_of(Rwv2::TabDescriptor, left_stop)
  assert_equal(1410, left_stop.position)
  assert_equal(2835, right_stop.position)
  assert_equal(4230, decimal_stop.position)
  assert_equal(5655, center_stop.position)
  assert_equal(Rwv2::TabDescriptor::ALIGN_LEFT, left_stop.align)
  assert_equal(Rwv2::TabDescriptor::ALIGN_RIGHT, right_stop.align)
  assert_equal(Rwv2::TabDescriptor::ALIGN_CENTER, center_stop.align)
  # FIXME
  # assert_equal(Rwv2::TabDescriptor::ALIGN_DECIMAL, decimal_stop.align)
end
|
553
|
+
# Creating a parser for the path in @unavailable must raise Errno::ENOENT.
def test_unavailable
  assert_raises(Errno::ENOENT) do
    Rwv2.create_parser(@unavailable)
  end
end
|
558
|
+
# Creating a parser for the file at @rtf must raise an ArgumentError whose
# message names the offending path.
def test_invalid__rtf
  # assert_raises hands back the rescued exception, so the message can be
  # checked without parsing a second time inside a begin/rescue that would
  # silently skip the check if nothing was raised.
  err = assert_raises(ArgumentError) {
    Rwv2.create_parser(@rtf)
  }
  # Plain interpolation instead of sprintf: a '%' in the path must not be
  # interpreted as a format directive.
  assert_equal("'#{@rtf}' is not a word-document.", err.message)
end
|
568
|
+
# Creating a parser from the raw content of the file at @rtf must raise an
# ArgumentError with a generic "not a word-document" message.
def test_invalid__rtf__from_content
  # assert_raises hands back the rescued exception, so the message can be
  # checked without reading and parsing the file a second time inside a
  # begin/rescue that would silently skip the check if nothing was raised.
  err = assert_raises(ArgumentError) {
    Rwv2.create_parser_from_content(File.read(@rtf))
  }
  assert_equal("Input is not a word-document.", err.message)
end
|
578
|
+
# Parses @filename8 and verifies the two picture descriptors recorded by
# the handler, including the pixel dimensions of each embedded image as
# decoded by Magick::Image.
def test_picture__word95
  ## later openoffice formats don't work yet.
  stub = StubTextHandler.new
  word_parser = Rwv2.create_parser(@filename8)
  assert_nothing_raised do
    word_parser.set_text_handler(stub)
  end
  assert_nothing_raised do
    word_parser.parse
  end
  assert_equal(2, stub.pictures.size)

  # Both pictures share every expectation except the display dimensions,
  # which are listed here as [display_width, display_height] per picture.
  [[1146, 1147], [1145, 1146]].each_with_index do |(width, height), idx|
    desc = stub.pictures.at(idx)
    assert_instance_of(Rwv2::PictureDescriptor, desc)
    assert_equal(width, desc.display_width)
    assert_equal(height, desc.display_height)
    assert_equal(999, desc.scaling_horizontal)
    assert_equal(999, desc.scaling_vertical)
    assert_equal(0, desc.crop_left)
    assert_equal(0, desc.crop_top)
    assert_equal(0, desc.crop_right)
    assert_equal(0, desc.crop_bottom)
    assert_equal(false, desc.is_bitmap?)
    assert_equal(false, desc.is_active_ole_object?)

    # from_blob returns a list; only its first element is inspected.
    image, = Magick::Image.from_blob(desc.blob)
    assert_equal(38, image.rows)
    assert_equal(38, image.columns)
  end
end
|
624
|
+
# Parses @filename9 and verifies that the single recorded text run ends in
# the two bytes written below as octal escapes.
def test_special_characters
  word_parser = Rwv2.create_parser(@filename9)
  stub = StubTextHandler.new
  assert_nothing_raised do
    word_parser.set_text_handler(stub)
  end
  word_parser.parse
  # "\305\223" is the byte pair 0xC5 0x93.
  assert_equal(["Ligature: \305\223"], stub.texts)
end
|
636
|
+
# helper methods
|
637
|
+
# Returns the positions (0-based, ascending) of all items in collection
# for which sending symbol yields a truthy value.
#
# collection:: anything that responds to each_with_index
# symbol::     name of the method to call on every item
def index_select(collection, symbol)
  hits = []
  collection.each_with_index do |entry, position|
    next unless entry.send(symbol)
    hits.push(position)
  end
  hits
end
|
644
|
+
end
|