textractor 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,206 @@
1
+ @echo off
2
+
3
+ :: docx2txt, a command-line utility to convert Docx documents to text format.
4
+ :: Copyright (C) 2008-now Sandeep Kumar
5
+ ::
6
+ :: This program is free software; you can redistribute it and/or modify
7
+ :: it under the terms of the GNU General Public License as published by
8
+ :: the Free Software Foundation; either version 3 of the License, or
9
+ :: (at your option) any later version.
10
+ ::
11
+ :: This program is distributed in the hope that it will be useful,
12
+ :: but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ :: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ :: GNU General Public License for more details.
15
+ ::
16
+ :: You should have received a copy of the GNU General Public License
17
+ :: along with this program; if not, write to the Free Software
18
+ :: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ ::
21
+ :: A simple commandline .docx to .txt converter
22
+ ::
23
+ :: This batch file is a wrapper around core docx2txt.pl script.
24
+ ::
25
+ :: Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
26
+ ::
27
+ :: ChangeLog :
28
+ ::
29
+ :: 17/09/2009 - Initial version of this file. It has similar functionality
30
+ :: as corresponding unix shell script.
31
+ :: 21/09/2009 - Updates to deal with paths containing spaces.
32
+ :: 22/09/2009 - Code reorganization, mainly around delayedexpansion command
33
+ :: extension.
34
+ :: 24/09/2009 - Required docx2txt.pl is expected in same location as this
35
+ :: batch file.
36
+ ::
37
+
38
+
39
+ ::
40
+ :: Set path (without surrounding quotes) to perl binary.
41
+ ::
42
+
43
set PERL=C:\Program Files\strawberry-perl-5.10.0.6\perl\bin\perl.exe

::
:: Optional unzipper. When CAKECMD is defined, the argument .docx file is
:: unzipped into a directory and that directory is handed to docx2txt.pl
:: in place of the file itself.
::

:: set CAKECMD=C:\Program Files\cake\CakeCmd.exe


::
:: Turn on the command extensions this batch file relies upon.
::

setlocal enableextensions
setlocal enabledelayedexpansion


::
:: The core perl script must sit in the same directory as this batch file.
::

set DOCX2TXT_PL=%~dp0docx2txt.pl

if exist "%DOCX2TXT_PL%" goto CHECK_INVOCATION

echo.
echo Can not continue without "%DOCX2TXT_PL%".
echo.
goto END


::
:: Exactly one argument is accepted; anything else falls through to the
:: usage message below.
::

:CHECK_INVOCATION

if "%~1" == "" goto USAGE
if "%~2" == "" goto CHECK_ARG


:USAGE

echo.
echo Usage : "%~0" file.docx
echo.
echo "file.docx" can also specify a directory holding the unzipped
echo content of a .docx file.
echo.
goto END
93
+
94
+
95
+ ::
96
+ :: Check if argument specifies a directory or a file.
97
+ ::
98
+
99
:CHECK_ARG

set INPARG=%~1

::
:: Reject an argument that names nothing on disk.
::

if not exist "%~1" (
    echo.
    echo Argument file/directory "%~1" does not exist.
    echo.
    goto END
)

::
:: Decide between file and directory from the attribute string of the
:: argument, whose first character is "d" for a directory. The older
:: "dir\nul" probe is not reliable on current Windows versions.
::

set INPATTR=%~a1
if /I not "%INPATTR:~0,1%" == "d" goto GENERATE_TXTFILE_NAME

set ARGISDIR=y

::
:: Remove any trailing '\'s from input directory name. The loop lives outside
:: of any parenthesised block so that each pass re-reads INPARG.
::

:INP_IS_DIR

if not "%INPARG:~-1%" == "\" goto GENERATE_TXTFILE_NAME
set INPARG=%INPARG:~0,-1%
goto INP_IS_DIR
117
+
118
+
119
+ ::
120
+ :: Generate output textfile name from input argument.
121
+ ::
122
+
123
:GENERATE_TXTFILE_NAME

::
:: Replace a trailing ".docx" by ".txt", otherwise append ".txt". The
:: comparison ignores case, so "REPORT.DOCX" yields "REPORT.txt".
::

set FILEEXT=%INPARG:~-5%
if /I "%FILEEXT%" == ".docx" (
    set TXTFILE=%INPARG:~0,-5%.txt
) else (
    set TXTFILE=%INPARG%.txt
)


::
:: Check whether output text file already exists, and whether user wants to
:: overwrite that. The answer variable is cleared before prompting because
:: "set /P" leaves it untouched when the user just presses Enter; a value
:: inherited from the environment must not override the default of Y.
::

if exist "%TXTFILE%" (
    echo.
    echo Output file "%TXTFILE%" already exists.
    set confirm=
    set /P confirm=Overwrite "%TXTFILE%" [Y/N - Default Y] ?

    if /I "!confirm!" == "N" (
        echo.
        echo Please copy "%TXTFILE%" somewhere else and rerun this batch file.
        echo.
        goto END
    )
)
150
+
151
+
152
+ ::
153
+ :: Since docx2txt.pl script expects an unzipper that can send the extracted
154
+ :: file to stdout. If CakeCmd.exe is being used as unzipper, then extract the
155
+ :: contents into a directory and pass that directory as the argument to the
156
+ :: perl script.
157
+ ::
158
+
159
if defined ARGISDIR goto CONVERT

::
:: With CakeCmd.exe as unzipper, the .docx file is renamed to .zip and its
:: content is extracted into a directory carrying the original name. The
:: rename target is given as bare file name, since "rename" does not accept
:: a drive or path in its second argument.
::

if defined CAKECMD (
    rename "%~1" "%~nx1.zip"
    echo y | "%CAKECMD%" extract "%~1.zip" \ "%~1" > nul
    set RENAMEBACK=yes
)


::
:: Invoke docx2txt.pl perl script to do the actual text extraction
::

:CONVERT

"%PERL%" "%DOCX2TXT_PL%" "%INPARG%" "%TXTFILE%"

::
:: The exclamation mark is escaped in the failure message, otherwise it is
:: consumed by delayed expansion and never printed.
::

if %ERRORLEVEL% == 2 (
    echo.
    echo Failed to extract text from "%~1"^^!
    echo.
) else if %ERRORLEVEL% == 0 (
    echo.
    echo Text extracted from "%~1" is available in "%TXTFILE%".
    echo.
)
185
+
186
+
187
:END

::
:: Undo the CakeCmd preparation: drop the extraction directory and give the
:: .zip file its original name back. The rename target is the bare file name,
:: since "rename" does not accept a drive or path in its second argument.
::

if defined RENAMEBACK (
    rmdir /S /Q "%~1"
    rename "%~1.zip" "%~nx1"
)

endlocal
endlocal

::
:: Clear the variables used by this batch file.
::

set PERL=
set DOCX2TXT_PL=
set CAKECMD=

set FILEEXT=
set INPARG=
set TXTFILE=
set ARGISDIR=
set RENAMEBACK=
set confirm=
@@ -0,0 +1,51 @@
1
+ #
2
+ # User controllable configuration parameters for docx2txt.pl
3
+ #
4
+ # Note:
5
+ # - Ensure that all configuration lines end with single comma (,).
6
+ # - Lines beginning with '#' are comments.
7
+ #
8
+
9
+ #
10
+ # Specify the path to "unzip" command.
11
+ #
12
+ # Windows users should specify this path like
13
+ #
14
+ # 'C:\Cygwin\bin\unzip.exe' (With Windows native perl.exe)
15
+ # Or
16
+ # 'C:/Cygwin/bin/unzip.exe' (With Cygwin/Windows native perl.exe)
17
+ #
18
+ # Default : '/usr/bin/unzip'
19
+ #
20
+ unzip => '/usr/bin/unzip',
21
+
22
+ #
23
+ # How the newline should be in output text file - "\n" or "\r\n".
24
+ #
25
+ # Default : "\n"
26
+ #
27
+ # newLine => "\n",
28
+
29
+ #
30
+ # How to indent nested lists - by "\t", " " or " " etc.
31
+ #
32
+ # Default : " "
33
+ #
34
+ # listIndent => " ",
35
+
36
+ #
37
+ # Line width to use for short line (single line paragraph) justification.
38
+ #
39
+ # Default : 80
40
+ #
41
+ # lineWidth => 80,
42
+
43
+ #
44
+ # Show hyperlink alongside linked text - [yY/nN]
45
+ #
46
+ # Note: Even if this option is enabled, hyperlinks will be shown only if
47
+ # hyperlink differs from the linked text.
48
+ #
49
+ # Default : N
50
+ #
51
+ showHyperLink => "Y",
@@ -0,0 +1,387 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # docx2txt, a command-line utility to convert Docx documents to text format.
4
+ # Copyright (C) 2008-2009 Sandeep Kumar
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ #
21
+ # This script extracts text from document.xml contained inside .docx file.
22
+ # Perl v5.8.2 was used for testing this script.
23
+ #
24
+ # Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
25
+ #
26
+ # ChangeLog :
27
+ #
28
+ # 10/08/2008 - Initial version (v0.1)
29
+ # 15/08/2008 - Script takes two arguments [second optional] now and can be
30
+ # used independently to extract text from docx file. It accepts
31
+ # docx file directly, instead of xml file.
32
+ # 18/08/2008 - Added support for center and right justification of text that
33
+ # fits in a line 80 characters wide (adjustable).
34
+ # 03/09/2008 - Fixed the slip in usage message.
35
+ # 12/09/2008 - Slightly changed the script invocation and argument handling
36
+ # to incorporate some of the shell script functionality here.
37
+ # Added support to handle embedded urls in docx document.
38
+ # 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
39
+ # Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
40
+ # during installation.
41
+ # 31/08/2009 - Added support for handling more escape characters.
42
+ # Using OS specific null device to redirect stderr.
43
+ # Saving text file in binary mode.
44
+ # 03/09/2009 - Updations based on feedback/suggestions from Sergei Kulakov
45
+ # (sergei>AT<dewia>DOT<com).
46
+ # - removal of non-document text in between TOC related tags.
47
+ # - display of hyperlink alongside linked text user controlled.
48
+ # - some character conversion updates
49
+ # 05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
50
+ # Added more character conversions.
51
+ # Organised conversion mappings in tabular form for speedup and
52
+ # easy maintenance.
53
+ # Tweaked code to reduce number of passes over document content.
54
+ # 10/09/2009 - For leaner text experience, hyperlink is not displayed if
55
+ # hyperlink and hyperlinked text are same, even if user has
56
+ # enabled hyperlink display.
57
+ # Improved handling of short line justification. Many
58
+ # justification tag patterns were not captured earlier.
59
+ # 11/09/2009 - A directory holding the unzipped content of .docx file can
60
+ # also be specified as argument to the script, in place of file.
61
+ # 17/09/2009 - Removed trailing slashes from input directory name.
62
+ # Updated unzip command invocations to handle path names
63
+ # containing spaces.
64
+ # 01/10/2009 - Added support for configuration file.
65
+ # 02/10/2009 - Using single quotes to specify path for unzip command.
66
+ # 04/10/2009 - Corrected configuration option name lineIndent to listIndent.
67
+ #
68
+
69
+
70
+ #
71
+ # The default settings below can be overridden via docx2txt.config - searched
72
+ # first in current directory and then in the same location as this script.
73
+ #
74
+
75
our $unzip         = '/usr/bin/unzip';  # Windows path like 'C:/path/to/unzip.exe'
our $newLine       = "\n";              # Alternative is "\r\n".
our $listIndent    = " ";               # Indent nested lists by "\t", " " etc.
our $lineWidth     = 80;                # Line width, used for short line justification.
our $showHyperLink = "N";               # Show hyperlink alongside linked text.


# ToDo: Better list handling. Currently assumed 8 level nesting.
# One bullet marker per list nesting level, outermost level first.
my @levchar = qw(* + o - ** ++ oo --);

#
# Character conversion tables
#

# Named entities and their plain text replacement. Only amp, gt and lt are
# required for docx escapes, the others are there for a better text experience.
my %escChrs = (
    amp    => '&',
    gt     => '>',
    lt     => '<',
    acute  => "'",
    brvbar => '|',
    copy   => '(C)',
    divide => '/',
    laquo  => '<<',
    macr   => '-',
    nbsp   => ' ',
    raquo  => '>>',
    reg    => '(R)',
    shy    => '-',
    times  => 'x',
);

# Multi-byte (UTF-8 encoded) characters and their ASCII replacement.
my %splchars = (
    # Two byte sequences starting with 0xC2.
    "\xC2\xA0" => ' ',      # <nbsp>
    "\xC2\xA6" => '|',      # <brokenbar>
    "\xC2\xA9" => '(C)',    # <copyright>
    "\xC2\xAB" => '<<',     # <laquo>
    "\xC2\xAC" => '-',      # <negate>
    "\xC2\xAE" => '(R)',    # <regd>
    "\xC2\xB1" => '+-',     # <plusminus>
    "\xC2\xBB" => '>>',     # <raquo>

    # Left unconverted on purpose.
    # "\xC2\xA7" => '',     # <section>
    # "\xC2\xB6" => '',     # <para>

    # Two byte sequences starting with 0xC3.
    "\xC3\x97" => 'x',      # <mul>
    "\xC3\xB7" => '/',      # <div>

    # Three byte sequences: spaces, dashes, quotes and punctuation.
    "\xE2\x80\x82" => ' ',      # <enspc>
    "\xE2\x80\x83" => ' ',      # <emspc>
    "\xE2\x80\x85" => ' ',      # <qemsp>
    "\xE2\x80\x93" => ' - ',    # <endash>
    "\xE2\x80\x94" => ' -- ',   # <emdash>
    "\xE2\x80\x98" => '`',      # <soq>
    "\xE2\x80\x99" => "'",      # <scq>
    "\xE2\x80\x9C" => '"',      # <doq>
    "\xE2\x80\x9D" => '"',      # <dcq>
    "\xE2\x80\xA2" => '::',     # <diamond symbol>
    "\xE2\x80\xA6" => '...',    # <ellipsis>

    "\xE2\x84\xA2" => '(TM)',   # <trademark>

    # Comparison operators.
    "\xE2\x89\xA0" => '!=',     # <neq>
    "\xE2\x89\xA4" => '<=',     # <leq>
    "\xE2\x89\xA5" => '>=',     # <geq>

    #
    # Currency symbols
    #
    "\xC2\xA2"     => 'cent',
    "\xC2\xA3"     => 'Pound',
    "\xC2\xA5"     => 'Yen',
    "\xE2\x82\xAC" => 'Euro',
);
139
+
140
+
141
+ #
142
+ # Check argument(s) sanity.
143
+ #
144
+
145
# Usage text, shown when the script is invoked with a wrong number of
# arguments. It starts and ends with an empty line.
my $usage = join("\n",
    '',
    "Usage: $0 <infile.docx> [outfile.txt|-]",
    '',
    "Use '-' as the outfile name to dump the text on STDOUT.",
    'Output is saved in infile.txt if second argument is omitted.',
    '',
    'infile.docx can also be a directory name holding the unzipped content',
    'of concerned .docx file.',
    '',
    '',
);

# Exactly one or two arguments are acceptable.
unless (@ARGV == 1 || @ARGV == 2) {
    die $usage;
}
158
+
159
+
160
+ #
161
+ # Check for existence and readability of required file in specified directory,
162
+ # and whether it is a text file.
163
+ #
164
+
165
# Dies unless the given file exists below the given folder as a readable
# plain file that looks like text.
#
#   $_[0] - file path, relative to the folder
#   $_[1] - folder name
sub check_for_required_file_in_folder {
    my ($file, $folder) = @_;
    my $path = "$folder/$file";

    # A single stat call; the file tests below reuse its result through "_".
    stat($path);

    unless (-f _ and -r _) {
        die "Can't read <$file> in <$folder>!\n";
    }

    die "<$path> does not seem to be a text file!\n" unless -T _;
}
170
+
171
# Reads a whole file, in binary mode, into the caller's variable.
#
#   $_[0] - name of the file to read
#   $_[1] - scalar receiving the file content; it is assigned through the
#           @_ alias, so the caller's own variable is modified
#
# Dies with "Couldn't read file <name>!" if the file can't be opened.
sub readFileInto
{
    local $/ = undef;    # Slurp mode: a single read returns the entire file.

    # Three argument open takes the name literally. The two argument form
    # strips surrounding whitespace from the name and interprets leading or
    # trailing mode and pipe characters found in it.
    open(my $fh, '<', $_[0]) or die "Couldn't read file <$_[0]>!\n";
    binmode $fh;
    $_[1] = <$fh>;
    close $fh;
}
179
+
180
+
181
+ #
182
+ # Check whether first argument is specifying a directory holding extracted
183
+ # content of .docx file, or .docx file itself.
184
+ #
185
+
186
# The file tests on "_" below reuse the result of this single stat call.
stat($ARGV[0]);

if (! -d _) {
    # A plain file: it has to be readable and must not look like text,
    # a .docx file being a zip archive.
    (-f _ && -r _) or die "Can't read docx file <$ARGV[0]>!\n";
    die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
}
else {
    # A directory: it has to hold the extracted parts used by this script.
    foreach my $required ("word/document.xml", "word/_rels/document.xml.rels") {
        check_for_required_file_in_folder($required, $ARGV[0]);
    }
    $inpIsDir = 'y';
}
197
+
198
+
199
+ #
200
+ # Get user configuration, if any.
201
+ #
202
+
203
# User settings read from docx2txt.config, as name => value pairs.
my %config;

{
    require File::Spec;

    # Candidate locations, in order of preference: the current directory,
    # then the directory this script was started from (when $0 carries one).
    my @candidates = ('docx2txt.config');
    push @candidates, "$1docx2txt.config" if $0 =~ m%^(.*[/\\])[^/\\]*?$%;

    foreach my $candidate (@candidates) {
        next unless -f $candidate;

        # "do" looks up a plain relative name in @INC, which no longer holds
        # the current directory on perl 5.26 and later. An absolute path is
        # loaded directly.
        my $configPath = File::Spec->rel2abs($candidate);
        my @settings = do $configPath;

        if ($@) {
            warn "Ignoring configuration file <$candidate>: $@";
        } else {
            # A lone undef is what "do" yields for an unreadable file or one
            # that holds nothing but comments.
            @settings = () if (@settings == 1 && !defined $settings[0]);

            if (@settings % 2) {
                warn "Ignoring configuration file <$candidate>: not a list of name => value pairs.\n";
            } else {
                %config = @settings;
            }
        }

        # Only the first configuration file found is considered.
        last;
    }
}

# Each configuration name overrides the package variable of the same name
# (symbolic reference on purpose).
foreach my $var (keys %config) {
    no strict 'refs';
    $$var = $config{$var};
}
216
+
217
+
218
+ #
219
+ # Extract xml document content from argument docx file/directory.
220
+ #
221
+
222
# OS specific null device, used to discard the stderr output of unzip.
$nulldevice = ($ENV{OS} =~ /^Windows/) ? "nul" : "/dev/null";

if ($inpIsDir ne 'y') {
    # A .docx file: let unzip print the document part on its stdout.
    $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
}
else {
    # An already unzipped .docx: read the document part directly.
    readFileInto("$ARGV[0]/word/document.xml", $content);
}

$content or die "Failed to extract required information from <$ARGV[0]>!\n";
235
+
236
+
237
+ #
238
+ # Be ready for outputting the extracted text contents.
239
+ #
240
+
241
# Derive the output file name from the input name, when none is given:
# a trailing ".docx" becomes ".txt", otherwise ".txt" is appended.
if (@ARGV == 1) {
    $ARGV[1] = $ARGV[0];

    # Remove any trailing slashes to generate proper output filename, when
    # input is directory.
    $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');

    $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
}

# Output handle: a duplicate of STDOUT for '-', the named file otherwise.
# The three argument form of open takes the file name literally. The two
# argument form would strip surrounding whitespace from the name and
# interpret mode or pipe characters found in it.
my $txtfile;
if ($ARGV[1] eq '-') {
    open($txtfile, '>&', \*STDOUT) || die "Can't create <$ARGV[1]> for output!\n";
} else {
    open($txtfile, '>', $ARGV[1]) || die "Can't create <$ARGV[1]> for output!\n";
}
binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
254
+
255
+
256
+ #
257
+ # Gather information about header, footer, hyperlinks, images, footnotes etc.
258
+ #
259
+
260
# Load the relationships part into $_, from the .docx file via unzip or
# from the unzipped directory.
if ($inpIsDir ne 'y') {
    $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
}
else {
    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
}

# Relationship targets, keyed by "<last path component of Type>:<Id>",
# e.g. "hyperlink:rId5".
my %docurels;
while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
{
    my ($relId, $relType, $relTarget) = ($1, $2, $3);
    $docurels{"$relType:$relId"} = $relTarget;
}
271
+
272
+
273
+ #
274
+ # Subroutines for center and right justification of text in a line.
275
+ #
276
+
277
# Pads a short line with leading blanks so that it appears centered or
# right aligned within $lineWidth columns.
#
#   $_[0] - alignment; "center" and "right" are acted upon
#   $_[1] - the text of the line
#
# Returns the text unchanged for any other alignment, or when it is too
# long to be padded.
sub justify {
    my ($mode, $text) = @_;
    my $room = $lineWidth - length $text;
    my $pad  = 0;

    if ($mode eq "center") {
        # Centering needs at least two spare columns.
        $pad = $room / 2 if $room > 1;
    }
    elsif ($mode eq "right") {
        $pad = $room if $room > 0;
    }

    # The repetition count is truncated to an integer by "x".
    return (' ' x $pad) . $text;
}
288
+
289
+ #
290
+ # Subroutines for dealing with embedded links and images
291
+ #
292
+
293
# Returns the text of a hyperlink element, stripped of its markup.
#
#   $_[0] - relationship id of the hyperlink (its r:id attribute)
#   $_[1] - content of the hyperlink element
#
# The target of the link is looked up in %docurels. It is appended as
# " [HYPERLINK: target]" when $showHyperLink is "y", a target is known for
# the id, and the target differs from the linked text.
sub hyperlink {
    my ($hlrid, $hltext) = @_;
    my $hlink = $docurels{"hyperlink:$hlrid"};

    $hltext =~ s/<[^>]*?>//og;

    # An id without a known target is left without a link marker, instead
    # of getting an empty "[HYPERLINK: ]" appended.
    if ($showHyperLink eq "y" && defined $hlink && $hlink ne '' && $hltext ne $hlink) {
        $hltext .= " [HYPERLINK: $hlink]";
    }

    return $hltext;
}
303
+
304
+ #
305
+ # Subroutines for processing paragraph content.
306
+ #
307
+
308
# Converts the content of one paragraph element to a line of text.
#
#   $_[0] - content of the paragraph element
#
# All markup is removed and $newLine is appended. If the paragraph carries
# a justification tag (<w:jc w:val="..."/>), the line is passed through
# justify() together with the justification value.
sub processParagraph {
    my $para = $_[0] . "$newLine";

    # Declared unconditionally: the behaviour of "my $x = ... if COND" is
    # undefined, and may let a value survive from one call to the next.
    my $align;
    $align = $1 if ($_[0] =~ /<w:jc w:val="([^"]*?)"\/>/);

    $para =~ s/<.*?>//og;
    return justify($align,$para) if $align;

    return $para;
}
317
+
318
+
319
+ #
320
+ # Force configuration value to lowercase as expected by script.
321
+ #
322
$showHyperLink = lc $showHyperLink;


#
# Text extraction starts.
#

# Replacement text for a few empty tags.
my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");

# Remove the xml declaration. The question mark after '<' is escaped,
# otherwise it would only make the '<' optional.
$content =~ s/<\?xml .*?\?>(\r)?\n//;

# Remove stuff between TOC related tags.
if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
    $content =~ s|<w:instrText[^>]*>.*?</w:instrText>||og;
}

$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;

# Paragraph borders are rendered as a horizontal rule.
my $hr = '-' x $lineWidth . $newLine;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;

# List items: indentation and bullet marker according to nesting level.
# Levels beyond the last entry of @levchar start over with the first marker,
# instead of ending up without any marker.
$content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . $levchar[$1 % @levchar] . ' '|oge;

#
# Uncomment either of below two lines and comment above line, if a single
# marker character is preferred for all nesting levels.
#

# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . '* '|oge;
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|oge;

# Text marked with <w:caps/> is replaced by its uppercased form.
$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;

$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;

$content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/oge;

# Remaining paragraph ends and line breaks become newlines, and whatever
# markup is left is dropped.
$content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$newLine|og;
$content =~ s/<.*?>//og;
361
+
362
+
363
+ #
364
+ # Convert non-ASCII characters/character sequences to ASCII characters.
365
+ #
366
+
367
# Multi-byte sequences listed in %splchars are replaced; any other sequence
# is kept as it is.
$content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;

#
# Convert docx specific escape chars first.
#
$content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/iog;

#
# Another pass for a better text experience, after sequences like "&amp;laquo;"
# are converted to "&laquo;".
#
$content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;


#
# Write the extracted and converted text contents to output.
#

# Write errors (full disk etc.) may only surface when the handle is closed.
# Both calls are checked, so that a truncated output is not mistaken for a
# successful conversion.
(print $txtfile $content) || die "Failed to write <$ARGV[1]>!\n";
close($txtfile) || die "Failed to write <$ARGV[1]>!\n";
387
+