textractor 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/textractor/document.rb +15 -7
- data/spec/document_spec.rb +22 -7
- data/spec/fixtures/document.docx +0 -0
- data/textractor.gemspec +19 -3
- data/vendor/docx2txt/AUTHORS +1 -0
- data/vendor/docx2txt/BSDmakefile +14 -0
- data/vendor/docx2txt/COPYING +674 -0
- data/vendor/docx2txt/ChangeLog +67 -0
- data/vendor/docx2txt/INSTALL +100 -0
- data/vendor/docx2txt/Makefile +23 -0
- data/vendor/docx2txt/README +109 -0
- data/vendor/docx2txt/ToDo +16 -0
- data/vendor/docx2txt/VERSION +1 -0
- data/vendor/docx2txt/WInstall.bat +218 -0
- data/vendor/docx2txt/docx2txt.bat +206 -0
- data/vendor/docx2txt/docx2txt.config +51 -0
- data/vendor/docx2txt/docx2txt.pl +387 -0
- data/vendor/docx2txt/docx2txt.sh +118 -0
- data/vendor/docx2txt/resume.docx +0 -0
- metadata +20 -4
@@ -0,0 +1,118 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# docx2txt, a command-line utility to convert Docx documents to text format.
|
4
|
+
# Copyright (C) 2008 Sandeep Kumar
|
5
|
+
#
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation; either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
|
20
|
+
#
|
21
|
+
# A simple .docx to .txt converter
|
22
|
+
#
|
23
|
+
# This script is a wrapper around core docx2txt.pl and saves text output for
|
24
|
+
# (filename or) filename.docx in filename.txt .
|
25
|
+
#
|
26
|
+
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
|
27
|
+
#
|
28
|
+
# ChangeLog :
|
29
|
+
#
|
30
|
+
# 10/08/2008 - Initial version (v0.1)
|
31
|
+
# 15/08/2008 - Invoking docx2txt.pl with docx document instead of xml file,
|
32
|
+
# so don't need unzip and rm actions now.
|
33
|
+
# Removed dependency on sed for generating output filename.
|
34
|
+
# 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
|
35
|
+
# Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
|
36
|
+
# during installation.
|
37
|
+
# 15/09/2009 - Added support for directory (holding unzipped content of
|
38
|
+
# .docx file) argument to keep its usage in sync with main
|
39
|
+
# docx2txt.pl script.
|
40
|
+
# Fixed bug in condition check for input file accessibility.
|
41
|
+
#
|
42
|
+
|
43
|
+
|
44
|
+
# Directory holding this wrapper; the docx2txt.pl perl script that the
# wrapper invokes is expected to live in the same place.
MYLOC=$(dirname "$0")
|
45
|
+
|
46
|
+
# Print the command-line help text on stdout and terminate the script
# with exit status 1. Takes no arguments.
usage() {
    # Unquoted delimiter on purpose: $0 must expand to the invoked name.
    cat <<USAGE_TEXT

Usage : $0 <file.docx>

<file.docx> can also specify a directory holding the unzipped
content of a .docx file.

USAGE_TEXT

    exit 1
}
|
59
|
+
|
60
|
+
# Exactly one argument (a .docx file, or a directory) is required.
[ $# -ne 1 ] && usage

#
# Remove trailing '/'s if any, when input specifies a directory.
#
# The +(/) pattern requires extglob. The stripped value is quoted and passed
# after '--' so that a name containing whitespace or glob characters stays a
# single argument and a name starting with '-' is not parsed as an option to
# 'set'. When stripping would leave nothing (the argument is empty or made
# only of slashes) the argument is left untouched, because a bare 'set'
# would dump every shell variable instead of changing $1.
#
shopt -s extglob
[ -n "${1%%+(/)}" ] && set -- "${1%%+(/)}"
|
67
|
+
|
68
|
+
# Validate the input path before doing any work:
#   - a directory must be both readable and searchable;
#   - anything else must be a regular, readable, non-empty file.
# Either failure prints a diagnostic and ends the script with status 1.
if [ -d "$1" ]; then
    if [ ! -r "$1" ] || [ ! -x "$1" ]; then
        echo -e "\nCan't access/read input directory <$1>!\n"
        exit 1
    fi
elif [ ! -f "$1" ] || [ ! -r "$1" ] || [ ! -s "$1" ]; then
    echo -e "\nCheck if <$1> exists, is readable and has non-zero size!\n"
    exit 1
fi
|
80
|
+
|
81
|
+
|
82
|
+
# Derive the output file name: a trailing ".docx" is swapped for ".txt";
# any other input name simply gets ".txt" appended.
case "$1" in
    *.docx) TEXTFILE="${1%.docx}.txt" ;;
    *)      TEXTFILE="$1.txt" ;;
esac
|
84
|
+
|
85
|
+
|
86
|
+
#
# Ask for confirmation before an existing file gets overwritten.
#
# $1 : filename to check for existence
# $2 : message regarding file (used in the prompt)
#
# Sets the global 'echeck' to 1 when the file exists and the answer read
# from stdin is anything other than "y"; otherwise leaves 'echeck' alone.
#
check_for_existence() {
    # Nothing to confirm when there is no regular file in the way.
    [ -f "$1" ] || return 0

    read -p "overwrite $2 <$1> [y/n] ? " yn
    [ "$yn" = "y" ] && return 0

    echo -e "\nPlease copy <$1> somewhere before running the script.\n"
    echeck=1
}
|
102
|
+
|
103
|
+
# Ask before clobbering an existing output file; stop with status 1 when
# check_for_existence flagged (via 'echeck') that the user declined.
echeck=0
check_for_existence "$TEXTFILE" "Output text file"
if [ "$echeck" -ne 0 ]; then
    exit 1
fi
|
106
|
+
|
107
|
+
#
# Invoke perl script to do the actual text extraction and report the result.
#
# A failed extraction now also ends the wrapper with exit status 1, in line
# with every other error path in this script; previously the failure message
# was printed but the wrapper still exited with status 0.
#

if "$MYLOC/docx2txt.pl" "$1" "$TEXTFILE"
then
    echo -e "\nText extracted from <$1> is available in <$TEXTFILE>.\n"
else
    echo -e "\nFailed to extract text from <$1>!\n"
    exit 1
fi
|
118
|
+
|
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-07-
|
18
|
+
date: 2010-07-27 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- lib/textractor/document.rb
|
55
55
|
- spec/document_spec.rb
|
56
56
|
- spec/fixtures/document.doc
|
57
|
+
- spec/fixtures/document.docx
|
57
58
|
- spec/fixtures/document.pdf
|
58
59
|
- spec/fixtures/document.txt
|
59
60
|
- spec/spec.opts
|
@@ -61,6 +62,21 @@ files:
|
|
61
62
|
- spec/textractor_spec.rb
|
62
63
|
- support/wvText.xml
|
63
64
|
- textractor.gemspec
|
65
|
+
- vendor/docx2txt/AUTHORS
|
66
|
+
- vendor/docx2txt/BSDmakefile
|
67
|
+
- vendor/docx2txt/COPYING
|
68
|
+
- vendor/docx2txt/ChangeLog
|
69
|
+
- vendor/docx2txt/INSTALL
|
70
|
+
- vendor/docx2txt/Makefile
|
71
|
+
- vendor/docx2txt/README
|
72
|
+
- vendor/docx2txt/ToDo
|
73
|
+
- vendor/docx2txt/VERSION
|
74
|
+
- vendor/docx2txt/WInstall.bat
|
75
|
+
- vendor/docx2txt/docx2txt.bat
|
76
|
+
- vendor/docx2txt/docx2txt.config
|
77
|
+
- vendor/docx2txt/docx2txt.pl
|
78
|
+
- vendor/docx2txt/docx2txt.sh
|
79
|
+
- vendor/docx2txt/resume.docx
|
64
80
|
has_rdoc: true
|
65
81
|
homepage: http://github.com/mguterl/textractor
|
66
82
|
licenses: []
|