textractor 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # docx2txt, a command-line utility to convert Docx documents to text format.
4
+ # Copyright (C) 2008 Sandeep Kumar
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ #
21
+ # A simple .docx to .txt converter
22
+ #
23
+ # This script is a wrapper around core docx2txt.pl and saves text output for
24
+ # (filename or) filename.docx in filename.txt .
25
+ #
26
+ # Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
27
+ #
28
+ # ChangeLog :
29
+ #
30
+ # 10/08/2008 - Initial version (v0.1)
31
+ # 15/08/2008 - Invoking docx2txt.pl with docx document instead of xml file,
32
+ # so don't need unzip and rm actions now.
33
+ # Removed dependency on sed for generating output filename.
34
+ # 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
35
+ # Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
36
+ # during installation.
37
+ # 15/09/2009 - Added support for directory (holding unzipped content of
38
+ # .docx file) argument to keep its usage in sync with main
39
+ # docx2txt.pl script.
40
+ # Fixed bug in condition check for input file accessibility.
41
+ #
42
+
43
+
44
+ MYLOC=$(dirname "$0") # directory of this wrapper; docx2txt.pl is expected to live here.
45
+
46
+ function usage () # Print the usage text below on stdout, then exit with status 1.
47
+ {
48
+ cat << _USAGE_ # unquoted delimiter, so $0 below expands to the invoked script name
49
+
50
+ Usage : $0 <file.docx>
51
+
52
+ <file.docx> can also specify a directory holding the unzipped
53
+ content of a .docx file.
54
+
55
+ _USAGE_
56
+
57
+ exit 1 # terminates the whole script, not just this function
58
+ }
59
+
60
+ [ $# != 1 ] && usage
61
+
62
+ #
63
+ # Remove trailing '/'s if any, when input specifies a directory.
64
+ #
65
+ shopt -s extglob
66
+ set ${1%%+(/)}
67
+
68
+ if [[ -d "$1" ]] # input is either an unzipped .docx directory or a .docx file
69
+ then
70
+ if ! [[ -r "$1" && -x "$1" ]] # [[ && ]] avoids the ambiguous, obsolescent -a of [ ]
71
+ then
72
+ echo -e "\nCan't access/read input directory <$1>!\n"
73
+ exit 1
74
+ fi
75
+ elif ! [[ -f "$1" && -r "$1" && -s "$1" ]] # must be a regular, readable, non-empty file
76
+ then
77
+ echo -e "\nCheck if <$1> exists, is readable and has non-zero size!\n"
78
+ exit 1
79
+ fi
80
+
81
+
82
+ TEXTFILE=${1/%.docx/.txt}
83
+ [ "$1" == "$TEXTFILE" ] && TEXTFILE="$1.txt"
84
+
85
+
86
+ #
87
+ # $1 : filename to check for existence
88
+ # $2 : message regarding file
89
+ #
90
+ function check_for_existence ()
91
+ {
92
+ # Nothing to confirm when the target is not an existing regular file.
93
+ [[ -f "$1" ]] || return 0
94
+ # Any reply other than a lone "y" refuses the overwrite and flags it via echeck.
95
+ read -p "overwrite $2 <$1> [y/n] ? " yn
96
+ if [[ "$yn" != "y" ]]
97
+ then
98
+ echo -e "\nPlease copy <$1> somewhere before running the script.\n"
99
+ echeck=1
100
+ fi
101
+ }
102
+
103
+ echeck=0
104
+ check_for_existence "$TEXTFILE" "Output text file"
105
+ [ $echeck -ne 0 ] && exit 1
106
+
107
+ #
108
+ # Invoke perl script to do the actual text extraction. The command is tested
109
+ # directly, and a failure now exits non-zero so callers can detect it.
110
+ #
111
+ if "$MYLOC/docx2txt.pl" "$1" "$TEXTFILE"
112
+ then
113
+ echo -e "\nText extracted from <$1> is available in <$TEXTFILE>.\n"
114
+ else
115
+ echo -e "\nFailed to extract text from <$1>!\n"
116
+ exit 1
117
+ fi
118
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 3
10
+ version: 0.0.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-26 00:00:00 -04:00
18
+ date: 2010-07-27 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -54,6 +54,7 @@ files:
54
54
  - lib/textractor/document.rb
55
55
  - spec/document_spec.rb
56
56
  - spec/fixtures/document.doc
57
+ - spec/fixtures/document.docx
57
58
  - spec/fixtures/document.pdf
58
59
  - spec/fixtures/document.txt
59
60
  - spec/spec.opts
@@ -61,6 +62,21 @@ files:
61
62
  - spec/textractor_spec.rb
62
63
  - support/wvText.xml
63
64
  - textractor.gemspec
65
+ - vendor/docx2txt/AUTHORS
66
+ - vendor/docx2txt/BSDmakefile
67
+ - vendor/docx2txt/COPYING
68
+ - vendor/docx2txt/ChangeLog
69
+ - vendor/docx2txt/INSTALL
70
+ - vendor/docx2txt/Makefile
71
+ - vendor/docx2txt/README
72
+ - vendor/docx2txt/ToDo
73
+ - vendor/docx2txt/VERSION
74
+ - vendor/docx2txt/WInstall.bat
75
+ - vendor/docx2txt/docx2txt.bat
76
+ - vendor/docx2txt/docx2txt.config
77
+ - vendor/docx2txt/docx2txt.pl
78
+ - vendor/docx2txt/docx2txt.sh
79
+ - vendor/docx2txt/resume.docx
64
80
  has_rdoc: true
65
81
  homepage: http://github.com/mguterl/textractor
66
82
  licenses: []