textractor 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/textractor/document.rb +15 -7
- data/spec/document_spec.rb +22 -7
- data/spec/fixtures/document.docx +0 -0
- data/textractor.gemspec +19 -3
- data/vendor/docx2txt/AUTHORS +1 -0
- data/vendor/docx2txt/BSDmakefile +14 -0
- data/vendor/docx2txt/COPYING +674 -0
- data/vendor/docx2txt/ChangeLog +67 -0
- data/vendor/docx2txt/INSTALL +100 -0
- data/vendor/docx2txt/Makefile +23 -0
- data/vendor/docx2txt/README +109 -0
- data/vendor/docx2txt/ToDo +16 -0
- data/vendor/docx2txt/VERSION +1 -0
- data/vendor/docx2txt/WInstall.bat +218 -0
- data/vendor/docx2txt/docx2txt.bat +206 -0
- data/vendor/docx2txt/docx2txt.config +51 -0
- data/vendor/docx2txt/docx2txt.pl +387 -0
- data/vendor/docx2txt/docx2txt.sh +118 -0
- data/vendor/docx2txt/resume.docx +0 -0
- metadata +20 -4
@@ -0,0 +1,206 @@
|
|
1
|
+
@echo off
|
2
|
+
|
3
|
+
:: docx2txt, a command-line utility to convert Docx documents to text format.
|
4
|
+
:: Copyright (C) 2008-now Sandeep Kumar
|
5
|
+
::
|
6
|
+
:: This program is free software; you can redistribute it and/or modify
|
7
|
+
:: it under the terms of the GNU General Public License as published by
|
8
|
+
:: the Free Software Foundation; either version 3 of the License, or
|
9
|
+
:: (at your option) any later version.
|
10
|
+
::
|
11
|
+
:: This program is distributed in the hope that it will be useful,
|
12
|
+
:: but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
:: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
:: GNU General Public License for more details.
|
15
|
+
::
|
16
|
+
:: You should have received a copy of the GNU General Public License
|
17
|
+
:: along with this program; if not, write to the Free Software
|
18
|
+
:: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
|
20
|
+
::
|
21
|
+
:: A simple commandline .docx to .txt converter
|
22
|
+
::
|
23
|
+
:: This batch file is a wrapper around core docx2txt.pl script.
|
24
|
+
::
|
25
|
+
:: Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
|
26
|
+
::
|
27
|
+
:: ChangeLog :
|
28
|
+
::
|
29
|
+
:: 17/09/2009 - Initial version of this file. It has similar functionality
|
30
|
+
:: as corresponding unix shell script.
|
31
|
+
:: 21/09/2009 - Updates to deal with paths containing spaces.
|
32
|
+
:: 22/09/2009 - Code reorganization, mainly around delayedexpansion command
|
33
|
+
:: extension.
|
34
|
+
:: 24/09/2009 - Required docx2txt.pl is expected in same location as this
|
35
|
+
:: batch file.
|
36
|
+
::
|
37
|
+
|
38
|
+
|
39
|
+
::
|
40
|
+
:: Set path (without surrounding quotes) to perl binary.
|
41
|
+
::
|
42
|
+
|
43
|
+
set PERL=C:\Program Files\strawberry-perl-5.10.0.6\perl\bin\perl.exe
|
44
|
+
|
45
|
+
::
|
46
|
+
:: If CAKECMD variable is set, batch file will unzip the content of argument
|
47
|
+
:: .docx file in a directory and pass that directory as the argument to the
|
48
|
+
:: docx2txt.pl script.
|
49
|
+
::
|
50
|
+
|
51
|
+
:: set CAKECMD=C:\Program Files\cake\CakeCmd.exe
|
52
|
+
|
53
|
+
|
54
|
+
::
|
55
|
+
:: Ensure that required command extensions are enabled.
|
56
|
+
::
|
57
|
+
|
58
|
+
setlocal enableextensions
|
59
|
+
setlocal enabledelayedexpansion
|
60
|
+
|
61
|
+
|
62
|
+
::
|
63
|
+
:: docx2txt.pl is expected to be in same location as this batch file.
|
64
|
+
::
|
65
|
+
|
66
|
+
set DOCX2TXT_PL=%~dp0docx2txt.pl
|
67
|
+
|
68
|
+
if not exist "%DOCX2TXT_PL%" (
|
69
|
+
echo.
|
70
|
+
echo Can not continue without "%DOCX2TXT_PL%".
|
71
|
+
echo.
|
72
|
+
goto END
|
73
|
+
)
|
74
|
+
|
75
|
+
|
76
|
+
::
|
77
|
+
:: Check if this batch file is invoked correctly.
|
78
|
+
::
|
79
|
+
if "%~1" == "" goto USAGE
|
80
|
+
if not "%~2" == "" goto USAGE
|
81
|
+
goto CHECK_ARG
|
82
|
+
|
83
|
+
|
84
|
+
:USAGE
|
85
|
+
|
86
|
+
echo.
|
87
|
+
echo Usage : "%~0" file.docx
|
88
|
+
echo.
|
89
|
+
echo "file.docx" can also specify a directory holding the unzipped
|
90
|
+
echo content of a .docx file.
|
91
|
+
echo.
|
92
|
+
goto END
|
93
|
+
|
94
|
+
|
95
|
+
::
|
96
|
+
:: Check if argument specifies a directory or a file.
|
97
|
+
::
|
98
|
+
|
99
|
+
:CHECK_ARG
|
100
|
+
|
101
|
+
set INPARG=%~1
|
102
|
+
|
103
|
+
if exist %~s1\nul (
|
104
|
+
set ARGISDIR=y
|
105
|
+
:: Remove any trailing '\'s from input directory name.
|
106
|
+
:INP_IS_DIR
|
107
|
+
set LastChar=%INPARG:~-1%
|
108
|
+
if not "!LastChar!" == "\" goto GENERATE_TXTFILE_NAME
|
109
|
+
set INPARG=%INPARG:~0,-1%
|
110
|
+
goto INP_IS_DIR
|
111
|
+
) else if not exist "%~1" (
|
112
|
+
echo.
|
113
|
+
echo Argument file/directory "%~1" does not exist.
|
114
|
+
echo.
|
115
|
+
goto END
|
116
|
+
)
|
117
|
+
|
118
|
+
|
119
|
+
::
|
120
|
+
:: Generate output textfile name from input argument.
|
121
|
+
::
|
122
|
+
|
123
|
+
:GENERATE_TXTFILE_NAME
|
124
|
+
|
125
|
+
set FILEEXT=%INPARG:~-5%
|
126
|
+
if "%FILEEXT%" == ".docx" (
|
127
|
+
set TXTFILE=%INPARG:~0,-5%.txt
|
128
|
+
) else (
|
129
|
+
set TXTFILE=%INPARG%.txt
|
130
|
+
)
|
131
|
+
|
132
|
+
|
133
|
+
::
|
134
|
+
:: Check whether output text file already exists, and whether user wants to
|
135
|
+
:: overwrite that.
|
136
|
+
::
|
137
|
+
|
138
|
+
if exist "%TXTFILE%" (
|
139
|
+
echo.
|
140
|
+
echo Output file "%TXTFILE%" already exists.
|
141
|
+
set /P confirm=Overwrite "%TXTFILE%" [Y/N - Default Y] ?
|
142
|
+
|
143
|
+
if /I "!confirm!" == "N" (
|
144
|
+
echo.
|
145
|
+
echo Please copy "%TXTFILE%" somewhere else and rerun this batch file.
|
146
|
+
echo.
|
147
|
+
goto END
|
148
|
+
)
|
149
|
+
)
|
150
|
+
|
151
|
+
|
152
|
+
::
|
153
|
+
:: The docx2txt.pl script expects an unzipper that can send the extracted
|
154
|
+
:: file to stdout. If CakeCmd.exe is being used as unzipper, then extract the
|
155
|
+
:: contents into a directory and pass that directory as the argument to the
|
156
|
+
:: perl script.
|
157
|
+
::
|
158
|
+
|
159
|
+
if defined ARGISDIR goto CONVERT
|
160
|
+
|
161
|
+
if defined CAKECMD (
|
162
|
+
rename "%~1" "%~1.zip"
|
163
|
+
echo y | "%CAKECMD%" extract "%~1.zip" \ "%~1" > nul
|
164
|
+
set RENAMEBACK=yes
|
165
|
+
)
|
166
|
+
|
167
|
+
|
168
|
+
::
|
169
|
+
:: Invoke docx2txt.pl perl script to do the actual text extraction
|
170
|
+
::
|
171
|
+
|
172
|
+
:CONVERT
|
173
|
+
|
174
|
+
"%PERL%" "%DOCX2TXT_PL%" "%INPARG%" "%TXTFILE%"
|
175
|
+
|
176
|
+
if %ERRORLEVEL% == 2 (
|
177
|
+
echo.
|
178
|
+
echo Failed to extract text from "%~1"!
|
179
|
+
echo.
|
180
|
+
) else if %ERRORLEVEL% == 0 (
|
181
|
+
echo.
|
182
|
+
echo Text extracted from "%~1" is available in "%TXTFILE%".
|
183
|
+
echo.
|
184
|
+
)
|
185
|
+
|
186
|
+
|
187
|
+
:END
|
188
|
+
|
189
|
+
if defined RENAMEBACK (
|
190
|
+
rmdir /S /Q "%~1"
|
191
|
+
rename "%~1.zip" "%~1"
|
192
|
+
)
|
193
|
+
|
194
|
+
endlocal
|
195
|
+
endlocal
|
196
|
+
|
197
|
+
set PERL=
|
198
|
+
set DOCX2TXT_PL=
|
199
|
+
set CAKECMD=
|
200
|
+
|
201
|
+
set FILEEXT=
|
202
|
+
set INPARG=
|
203
|
+
set TXTFILE=
|
204
|
+
set ARGISDIR=
|
205
|
+
set RENAMEBACK=
|
206
|
+
set confirm=
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#
|
2
|
+
# User controllable configuration parameters for docx2txt.pl
|
3
|
+
#
|
4
|
+
# Note:
|
5
|
+
# - Ensure that all configuration lines end with single comma (,).
|
6
|
+
# - Lines beginning with '#' are comments.
|
7
|
+
#
|
8
|
+
|
9
|
+
#
|
10
|
+
# Specify the path to "unzip" command.
|
11
|
+
#
|
12
|
+
# Windows users should specify this path like
|
13
|
+
#
|
14
|
+
# 'C:\Cygwin\bin\unzip.exe' (With Windows native perl.exe)
|
15
|
+
# Or
|
16
|
+
# 'C:/Cygwin/bin/unzip.exe' (With Cygwin/Windows native perl.exe)
|
17
|
+
#
|
18
|
+
# Default : '/usr/bin/unzip'
|
19
|
+
#
|
20
|
+
unzip => '/usr/bin/unzip',
|
21
|
+
|
22
|
+
#
|
23
|
+
# How the newline should be in output text file - "\n" or "\r\n".
|
24
|
+
#
|
25
|
+
# Default : "\n"
|
26
|
+
#
|
27
|
+
# newLine => "\n",
|
28
|
+
|
29
|
+
#
|
30
|
+
# How to indent nested lists - by "\t", " " or " " etc.
|
31
|
+
#
|
32
|
+
# Default : " "
|
33
|
+
#
|
34
|
+
# listIndent => " ",
|
35
|
+
|
36
|
+
#
|
37
|
+
# Line width to use for short line (single line paragraph) justification.
|
38
|
+
#
|
39
|
+
# Default : 80
|
40
|
+
#
|
41
|
+
# lineWidth => 80,
|
42
|
+
|
43
|
+
#
|
44
|
+
# Show hyperlink alongside linked text - [yY/nN]
|
45
|
+
#
|
46
|
+
# Note: Even if this option is enabled, hyperlinks will be shown only if
|
47
|
+
# hyperlink differs from the linked text.
|
48
|
+
#
|
49
|
+
# Default : N
|
50
|
+
#
|
51
|
+
showHyperLink => "Y",
|
@@ -0,0 +1,387 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
|
3
|
+
# docx2txt, a command-line utility to convert Docx documents to text format.
|
4
|
+
# Copyright (C) 2008-2009 Sandeep Kumar
|
5
|
+
#
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation; either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
|
20
|
+
#
|
21
|
+
# This script extracts text from document.xml contained inside .docx file.
|
22
|
+
# Perl v5.8.2 was used for testing this script.
|
23
|
+
#
|
24
|
+
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
|
25
|
+
#
|
26
|
+
# ChangeLog :
|
27
|
+
#
|
28
|
+
# 10/08/2008 - Initial version (v0.1)
|
29
|
+
# 15/08/2008 - Script takes two arguments [second optional] now and can be
|
30
|
+
# used independently to extract text from docx file. It accepts
|
31
|
+
# docx file directly, instead of xml file.
|
32
|
+
# 18/08/2008 - Added support for center and right justification of text that
|
33
|
+
# fits in a line 80 characters wide (adjustable).
|
34
|
+
# 03/09/2008 - Fixed the slip in usage message.
|
35
|
+
# 12/09/2008 - Slightly changed the script invocation and argument handling
|
36
|
+
# to incorporate some of the shell script functionality here.
|
37
|
+
# Added support to handle embedded urls in docx document.
|
38
|
+
# 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
|
39
|
+
# Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
|
40
|
+
# during installation.
|
41
|
+
# 31/08/2009 - Added support for handling more escape characters.
|
42
|
+
# Using OS specific null device to redirect stderr.
|
43
|
+
# Saving text file in binary mode.
|
44
|
+
# 03/09/2009 - Updates based on feedback/suggestions from Sergei Kulakov
|
45
|
+
# (sergei>AT<dewia>DOT<com).
|
46
|
+
# - removal of non-document text in between TOC related tags.
|
47
|
+
# - display of hyperlink alongside linked text user controlled.
|
48
|
+
# - some character conversion updates
|
49
|
+
# 05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
|
50
|
+
# Added more character conversions.
|
51
|
+
# Organised conversion mappings in tabular form for speedup and
|
52
|
+
# easy maintenance.
|
53
|
+
# Tweaked code to reduce number of passes over document content.
|
54
|
+
# 10/09/2009 - For leaner text experience, hyperlink is not displayed if
|
55
|
+
# hyperlink and hyperlinked text are same, even if user has
|
56
|
+
# enabled hyperlink display.
|
57
|
+
# Improved handling of short line justification. Many
|
58
|
+
# justification tag patterns were not captured earlier.
|
59
|
+
# 11/09/2009 - A directory holding the unzipped content of .docx file can
|
60
|
+
# also be specified as argument to the script, in place of file.
|
61
|
+
# 17/09/2009 - Removed trailing slashes from input directory name.
|
62
|
+
# Updated unzip command invocations to handle path names
|
63
|
+
# containing spaces.
|
64
|
+
# 01/10/2009 - Added support for configuration file.
|
65
|
+
# 02/10/2009 - Using single quotes to specify path for unzip command.
|
66
|
+
# 04/10/2009 - Corrected configuration option name lineIndent to listIndent.
|
67
|
+
#
|
68
|
+
|
69
|
+
|
70
|
+
#
|
71
|
+
# The default settings below can be overridden via docx2txt.config - searched
|
72
|
+
# first in current directory and then in the same location as this script.
|
73
|
+
#
|
74
|
+
|
75
|
+
our $unzip = '/usr/bin/unzip'; # Windows path like 'C:/path/to/unzip.exe'
|
76
|
+
our $newLine = "\n"; # Alternative is "\r\n".
|
77
|
+
our $listIndent = " "; # Indent nested lists by "\t", " " etc.
|
78
|
+
our $lineWidth = 80; # Line width, used for short line justification.
|
79
|
+
our $showHyperLink = "N"; # Show hyperlink alongside linked text.
|
80
|
+
|
81
|
+
|
82
|
+
# ToDo: Better list handling. Currently assumed 8 level nesting.
|
83
|
+
my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');
|
84
|
+
|
85
|
+
#
|
86
|
+
# Character conversion tables
|
87
|
+
#
|
88
|
+
|
89
|
+
# Only amp, gt and lt are required for docx escapes, others are used for better
|
90
|
+
# text experience.
|
91
|
+
my %escChrs = ( amp => '&', gt => '>', lt => '<',
|
92
|
+
acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
|
93
|
+
laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
|
94
|
+
reg => '(R)', shy => '-', times => 'x'
|
95
|
+
);
|
96
|
+
|
97
|
+
my %splchars = (
|
98
|
+
"\xC2\xA0" => ' ', # <nbsp>
|
99
|
+
"\xC2\xA6" => '|', # <brokenbar>
|
100
|
+
"\xC2\xA9" => '(C)', # <copyright>
|
101
|
+
"\xC2\xAB" => '<<', # <laquo>
|
102
|
+
"\xC2\xAC" => '-', # <negate>
|
103
|
+
"\xC2\xAE" => '(R)', # <regd>
|
104
|
+
"\xC2\xB1" => '+-', # <plusminus>
|
105
|
+
"\xC2\xBB" => '>>', # <raquo>
|
106
|
+
|
107
|
+
# "\xC2\xA7" => '', # <section>
|
108
|
+
# "\xC2\xB6" => '', # <para>
|
109
|
+
|
110
|
+
"\xC3\x97" => 'x', # <mul>
|
111
|
+
"\xC3\xB7" => '/', # <div>
|
112
|
+
|
113
|
+
"\xE2\x80\x82" => ' ', # <enspc>
|
114
|
+
"\xE2\x80\x83" => ' ', # <emspc>
|
115
|
+
"\xE2\x80\x85" => ' ', # <qemsp>
|
116
|
+
"\xE2\x80\x93" => ' - ', # <endash>
|
117
|
+
"\xE2\x80\x94" => ' -- ', # <emdash>
|
118
|
+
"\xE2\x80\x98" => '`', # <soq>
|
119
|
+
"\xE2\x80\x99" => '\'', # <scq>
|
120
|
+
"\xE2\x80\x9C" => '"', # <doq>
|
121
|
+
"\xE2\x80\x9D" => '"', # <dcq>
|
122
|
+
"\xE2\x80\xA2" => '::', # <diamond symbol>
|
123
|
+
"\xE2\x80\xA6" => '...', # <ellipsis>
|
124
|
+
|
125
|
+
"\xE2\x84\xA2" => '(TM)', # <trademark>
|
126
|
+
|
127
|
+
"\xE2\x89\xA0" => '!=', # <neq>
|
128
|
+
"\xE2\x89\xA4" => '<=', # <leq>
|
129
|
+
"\xE2\x89\xA5" => '>=', # <geq>
|
130
|
+
|
131
|
+
#
|
132
|
+
# Currency symbols
|
133
|
+
#
|
134
|
+
"\xC2\xA2" => 'cent',
|
135
|
+
"\xC2\xA3" => 'Pound',
|
136
|
+
"\xC2\xA5" => 'Yen',
|
137
|
+
"\xE2\x82\xAC" => 'Euro'
|
138
|
+
);
|
139
|
+
|
140
|
+
|
141
|
+
#
|
142
|
+
# Check argument(s) sanity.
|
143
|
+
#
|
144
|
+
|
145
|
+
my $usage = <<USAGE;
|
146
|
+
|
147
|
+
Usage: $0 <infile.docx> [outfile.txt|-]
|
148
|
+
|
149
|
+
Use '-' as the outfile name to dump the text on STDOUT.
|
150
|
+
Output is saved in infile.txt if second argument is omitted.
|
151
|
+
|
152
|
+
infile.docx can also be a directory name holding the unzipped content
|
153
|
+
of concerned .docx file.
|
154
|
+
|
155
|
+
USAGE
|
156
|
+
|
157
|
+
die $usage if (@ARGV == 0 || @ARGV > 2);
|
158
|
+
|
159
|
+
|
160
|
+
#
|
161
|
+
# Check for existence and readability of required file in specified directory,
|
162
|
+
# and whether it is a text file.
|
163
|
+
#
|
164
|
+
|
165
|
+
# Verify that a file needed for the conversion exists inside a directory.
#
# Arguments:
#   $_[0] - path of the required file, relative to the directory
#   $_[1] - directory to look in
#
# Dies with a message when the path is not a readable regular file, or when
# it fails Perl's -T (text file) heuristic.
sub check_for_required_file_in_folder {
    my ($file, $folder) = @_;
    my $path = "$folder/$file";

    # One stat() call fills the special "_" handle that the file tests
    # below reuse, so the file system is queried only once.
    stat($path);
    die "Can't read <$file> in <$folder>!\n" if ! (-f _ && -r _);
    die "<$path> does not seem to be a text file!\n" if ! -T _;
}
|
170
|
+
|
171
|
+
# Read the whole content of a file into the caller's variable.
#
# Arguments:
#   $_[0] - path of the file to read
#   $_[1] - variable receiving the content; it is assigned through the @_
#           alias, so the caller's own variable is modified
#
# Dies when the file cannot be opened.
sub readFileInto
{
    my $path = $_[0];

    # Three-argument open: the path is always taken as a plain file name.
    # Two-argument open would interpret a leading '>' or a trailing '|'
    # in the name as a mode or a command.
    open my $fh, '<', $path or die "Couldn't read file <$path>!\n";
    binmode $fh;            # Raw bytes, no newline translation.

    local $/ = undef;       # Slurp mode: the next read returns everything.
    $_[1] = <$fh>;
    close $fh;
}
|
179
|
+
|
180
|
+
|
181
|
+
#
|
182
|
+
# Check whether first argument is specifying a directory holding extracted
|
183
|
+
# content of .docx file, or .docx file itself.
|
184
|
+
#
|
185
|
+
|
186
|
+
stat($ARGV[0]);
|
187
|
+
|
188
|
+
if (-d _) {
|
189
|
+
check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
|
190
|
+
check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
|
191
|
+
$inpIsDir = 'y';
|
192
|
+
}
|
193
|
+
else {
|
194
|
+
die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);
|
195
|
+
die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
|
196
|
+
}
|
197
|
+
|
198
|
+
|
199
|
+
#
|
200
|
+
# Get user configuration, if any.
|
201
|
+
#
|
202
|
+
|
203
|
+
# Load user overrides from docx2txt.config. The file is looked up in the
# current directory first, then in the directory named in $0. It is evaluated
# as Perl code and is expected to yield a list of key => value pairs.
my %config;
my $configFile;

if (-f "docx2txt.config") {
    $configFile = "docx2txt.config";
} elsif ($0 =~ m%^(.*[/\\])[^/\\]*?$% && -f "$1docx2txt.config") {
    $configFile = "$1docx2txt.config";
}

if (defined $configFile) {
    require File::Spec;

    # do() searches @INC for relative file names, and "." is no longer in
    # @INC since Perl 5.26. An absolute path is always loaded directly.
    $configFile = File::Spec->rel2abs($configFile);

    my @settings = do $configFile;
    if ($@) {
        warn "Ignoring configuration file <$configFile>: $@";
    } elsif (@settings % 2 == 0) {
        %config = @settings;
    }
}

# Only the documented settings may be overridden; any other key is skipped so
# that a configuration file cannot overwrite unrelated package variables.
my %configurable = map { $_ => 1 }
    qw(unzip newLine listIndent lineWidth showHyperLink);

foreach my $var (keys %config) {
    next if !$configurable{$var};

    # Symbolic reference: assigns the package variable named by the key.
    no strict 'refs';
    $$var = $config{$var};
}
|
216
|
+
|
217
|
+
|
218
|
+
#
|
219
|
+
# Extract xml document content from argument docx file/directory.
|
220
|
+
#
|
221
|
+
|
222
|
+
if ($ENV{OS} =~ /^Windows/) {
|
223
|
+
$nulldevice = "nul";
|
224
|
+
} else {
|
225
|
+
$nulldevice = "/dev/null";
|
226
|
+
}
|
227
|
+
|
228
|
+
if ($inpIsDir eq 'y') {
|
229
|
+
readFileInto("$ARGV[0]/word/document.xml", $content);
|
230
|
+
} else {
|
231
|
+
$content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
|
232
|
+
}
|
233
|
+
|
234
|
+
die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
|
235
|
+
|
236
|
+
|
237
|
+
#
|
238
|
+
# Be ready for outputting the extracted text contents.
|
239
|
+
#
|
240
|
+
|
241
|
+
if (@ARGV == 1) {
|
242
|
+
$ARGV[1] = $ARGV[0];
|
243
|
+
|
244
|
+
# Remove any trailing slashes to generate proper output filename, when
|
245
|
+
# input is directory.
|
246
|
+
$ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');
|
247
|
+
|
248
|
+
$ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
|
249
|
+
}
|
250
|
+
|
251
|
+
# Open the output destination. An output name of "-" means STDOUT, as stated
# in the usage message; it is handled explicitly because three-argument open
# treats every name as a plain file name.
my $txtfile;
if ($ARGV[1] eq '-') {
    open($txtfile, '>&', \*STDOUT) || die "Can't create <$ARGV[1]> for output!\n";
} else {
    open($txtfile, '>', $ARGV[1]) || die "Can't create <$ARGV[1]> for output!\n";
}
binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
|
254
|
+
|
255
|
+
|
256
|
+
#
|
257
|
+
# Gather information about header, footer, hyperlinks, images, footnotes etc.
|
258
|
+
#
|
259
|
+
|
260
|
+
if ($inpIsDir eq 'y') {
|
261
|
+
readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
|
262
|
+
} else {
|
263
|
+
$_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
|
264
|
+
}
|
265
|
+
|
266
|
+
my %docurels;
|
267
|
+
while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
|
268
|
+
{
|
269
|
+
$docurels{"$2:$1"} = $3;
|
270
|
+
}
|
271
|
+
|
272
|
+
|
273
|
+
#
|
274
|
+
# Subroutine for center and right justification of text in a line.
|
275
|
+
#
|
276
|
+
|
277
|
+
# Pad a short line with leading spaces so that it appears centered or
# right-aligned within $lineWidth columns.
#
# Arguments:
#   $_[0] - alignment name; "center" and "right" are acted upon
#   $_[1] - the text of the line
#
# Returns the padded text, or the text unchanged when the alignment is
# something else or the text is too long to be padded.
sub justify {
    my ($mode, $text) = @_;
    my $width = length $text;
    my $pad = 0;

    if ($mode eq "center" && $width < ($lineWidth - 1)) {
        $pad = ($lineWidth - $width) / 2;
    } elsif ($mode eq "right" && $width < $lineWidth) {
        $pad = $lineWidth - $width;
    }

    # The repetition operator truncates a fractional count, and a count of
    # zero yields the empty string.
    return ' ' x $pad . $text;
}
|
288
|
+
|
289
|
+
#
|
290
|
+
# Subroutines for dealing with embedded links and images
|
291
|
+
#
|
292
|
+
|
293
|
+
# Convert a hyperlink element to plain text.
#
# Arguments:
#   $_[0] - relationship id of the hyperlink, looked up in %docurels under
#           the key "hyperlink:<id>"
#   $_[1] - markup enclosed by the hyperlink element
#
# Returns the enclosed text with all tags removed. When $showHyperLink is "y"
# and the link target is known and differs from the text, the target is
# appended as " [HYPERLINK: <target>]".
sub hyperlink {
    my ($hlrid, $hltext) = @_;
    my $hlink = $docurels{"hyperlink:$hlrid"};

    $hltext =~ s/<[^>]*?>//g;

    # A relationship id without an entry in %docurels has no target; skip
    # the marker rather than emit an empty "[HYPERLINK: ]".
    $hltext .= " [HYPERLINK: $hlink]"
        if ($showHyperLink eq "y" && defined $hlink && $hltext ne $hlink);

    return $hltext;
}
|
303
|
+
|
304
|
+
#
|
305
|
+
# Subroutines for processing paragraph content.
|
306
|
+
#
|
307
|
+
|
308
|
+
# Convert the markup of one paragraph to a line of plain text.
#
# Arguments:
#   $_[0] - markup found between the paragraph's opening and closing tags
#
# Returns the text with all tags removed and $newLine appended. When the
# paragraph carries a <w:jc w:val="..."/> element, the result is passed
# through justify() with that alignment value.
sub processParagraph {
    my $para = $_[0] . "$newLine";

    # List-context match: $align receives the captured value, or undef when
    # there is no match. This replaces "my ... if ...", whose behaviour Perl
    # documents as undefined.
    my ($align) = $_[0] =~ /<w:jc w:val="([^"]*?)"\/>/;

    $para =~ s/<.*?>//g;
    return justify($align,$para) if $align;

    return $para;
}
|
317
|
+
|
318
|
+
|
319
|
+
#
|
320
|
+
# Force configuration value to lowercase as expected by script.
|
321
|
+
#
|
322
|
+
$showHyperLink = lc $showHyperLink;
|
323
|
+
|
324
|
+
|
325
|
+
#
|
326
|
+
# Text extraction starts.
|
327
|
+
#
|
328
|
+
|
329
|
+
my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");
|
330
|
+
|
331
|
+
# Remove the leading XML declaration line. The "?" after "<" must be escaped:
# unescaped it only makes "<" optional, so the match started at "xml" and
# left a stray "<?" behind.
$content =~ s/<\?xml .*?\?>(\r)?\n//;
|
332
|
+
|
333
|
+
# Remove stuff between TOC related tags.
|
334
|
+
if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
|
335
|
+
$content =~ s|<w:instrText[^>]*>.*?</w:instrText>||og;
|
336
|
+
}
|
337
|
+
|
338
|
+
$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;
|
339
|
+
|
340
|
+
my $hr = '-' x $lineWidth . $newLine;
|
341
|
+
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;
|
342
|
+
|
343
|
+
$content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . "$levchar[$1] "|oge;
|
344
|
+
|
345
|
+
#
|
346
|
+
# Uncomment either of below two lines and comment above line, if dealing
|
347
|
+
# with more than 8 level nested lists.
|
348
|
+
#
|
349
|
+
|
350
|
+
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . '* '|oge;
|
351
|
+
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|oge;
|
352
|
+
|
353
|
+
$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;
|
354
|
+
|
355
|
+
$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;
|
356
|
+
|
357
|
+
$content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/oge;
|
358
|
+
|
359
|
+
$content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$newLine|og;
|
360
|
+
$content =~ s/<.*?>//og;
|
361
|
+
|
362
|
+
|
363
|
+
#
|
364
|
+
# Convert non-ASCII characters/character sequences to ASCII characters.
|
365
|
+
#
|
366
|
+
|
367
|
+
$content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;
|
368
|
+
|
369
|
+
#
|
370
|
+
# Convert docx specific escape chars first.
|
371
|
+
#
|
372
|
+
$content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/iog;
|
373
|
+
|
374
|
+
#
|
375
|
+
# Another pass for a better text experience, after sequences like "&laquo;"
|
376
|
+
# are converted to "«".
|
377
|
+
#
|
378
|
+
$content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;
|
379
|
+
|
380
|
+
|
381
|
+
#
|
382
|
+
# Write the extracted and converted text contents to output.
|
383
|
+
#
|
384
|
+
|
385
|
+
print $txtfile $content;
|
386
|
+
close $txtfile;
|
387
|
+
|