textractor 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # docx2txt, a command-line utility to convert Docx documents to text format.
4
+ # Copyright (C) 2008 Sandeep Kumar
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ #
21
+ # A simple .docx to .txt converter
22
+ #
23
+ # This script is a wrapper around core docx2txt.pl and saves text output for
24
+ # (filename or) filename.docx in filename.txt.
25
+ #
26
+ # Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
27
+ #
28
+ # ChangeLog :
29
+ #
30
+ # 10/08/2008 - Initial version (v0.1)
31
+ # 15/08/2008 - Invoking docx2txt.pl with docx document instead of xml file,
32
+ # so don't need unzip and rm actions now.
33
+ # Removed dependency on sed for generating output filename.
34
+ # 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
35
+ # Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
36
+ # during installation.
37
+ # 15/09/2009 - Added support for directory (holding unzipped content of
38
+ # .docx file) argument to keep its usage in sync with main
39
+ # docx2txt.pl script.
40
+ # Fixed bug in condition check for input file accessibility.
41
+ #
42
+
43
+
44
+ MYLOC=`dirname "$0"` # invoked perl script docx2txt.pl is expected here.
45
+
46
+ function usage ()
47
+ {
48
+ cat << _USAGE_
49
+
50
+ Usage : $0 <file.docx>
51
+
52
+ <file.docx> can also specify a directory holding the unzipped
53
+ content of a .docx file.
54
+
55
+ _USAGE_
56
+
57
+ exit 1
58
+ }
59
+
60
+ [ $# != 1 ] && usage
61
+
62
+ #
63
+ # Remove trailing '/'s if any, when input specifies a directory.
64
+ #
65
+ shopt -s extglob
66
+ set ${1%%+(/)}
67
+
68
+ if [ -d "$1" ]
69
+ then
70
+ if ! [ -r "$1" -a -x "$1" ]
71
+ then
72
+ echo -e "\nCan't access/read input directory <$1>!\n"
73
+ exit 1
74
+ fi
75
+ elif ! [ -f "$1" -a -r "$1" -a -s "$1" ]
76
+ then
77
+ echo -e "\nCheck if <$1> exists, is readable and has non-zero size!\n"
78
+ exit 1
79
+ fi
80
+
81
+
82
+ TEXTFILE=${1/%.docx/.txt}
83
+ [ "$1" == "$TEXTFILE" ] && TEXTFILE="$1.txt"
84
+
85
+
86
+ #
87
+ # $1 : filename to check for existence
88
+ # $2 : message regarding file
89
+ #
90
+ function check_for_existence ()
91
+ {
92
+ if [ -f "$1" ]
93
+ then
94
+ read -p "overwrite $2 <$1> [y/n] ? " yn
95
+ if [ "$yn" != "y" ]
96
+ then
97
+ echo -e "\nPlease copy <$1> somewhere before running the script.\n"
98
+ echeck=1
99
+ fi
100
+ fi
101
+ }
102
+
103
+ echeck=0
104
+ check_for_existence "$TEXTFILE" "Output text file"
105
+ [ $echeck -ne 0 ] && exit 1
106
+
107
+ #
108
+ # Invoke perl script to do the actual text extraction
109
+ #
110
+
111
+ "$MYLOC/docx2txt.pl" "$1" "$TEXTFILE"
112
+ if [ $? == 0 ]
113
+ then
114
+ echo -e "\nText extracted from <$1> is available in <$TEXTFILE>.\n"
115
+ else
116
+ echo -e "\nFailed to extract text from <$1>!\n"
117
+ fi
118
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 3
10
+ version: 0.0.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-26 00:00:00 -04:00
18
+ date: 2010-07-27 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -54,6 +54,7 @@ files:
54
54
  - lib/textractor/document.rb
55
55
  - spec/document_spec.rb
56
56
  - spec/fixtures/document.doc
57
+ - spec/fixtures/document.docx
57
58
  - spec/fixtures/document.pdf
58
59
  - spec/fixtures/document.txt
59
60
  - spec/spec.opts
@@ -61,6 +62,21 @@ files:
61
62
  - spec/textractor_spec.rb
62
63
  - support/wvText.xml
63
64
  - textractor.gemspec
65
+ - vendor/docx2txt/AUTHORS
66
+ - vendor/docx2txt/BSDmakefile
67
+ - vendor/docx2txt/COPYING
68
+ - vendor/docx2txt/ChangeLog
69
+ - vendor/docx2txt/INSTALL
70
+ - vendor/docx2txt/Makefile
71
+ - vendor/docx2txt/README
72
+ - vendor/docx2txt/ToDo
73
+ - vendor/docx2txt/VERSION
74
+ - vendor/docx2txt/WInstall.bat
75
+ - vendor/docx2txt/docx2txt.bat
76
+ - vendor/docx2txt/docx2txt.config
77
+ - vendor/docx2txt/docx2txt.pl
78
+ - vendor/docx2txt/docx2txt.sh
79
+ - vendor/docx2txt/resume.docx
64
80
  has_rdoc: true
65
81
  homepage: http://github.com/mguterl/textractor
66
82
  licenses: []