docx2txt 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7644ad54195f4d768b27e39e24adc29cf07f6864
4
+ data.tar.gz: 7b274d12dba47626413924f8c854cc3e572cf21b
5
+ SHA512:
6
+ metadata.gz: b9cdc6b7ae36641220d68c03f4edee66825c236da676f3b17ea9c83a8cc5bb0fcedff422acc4b890a35584dbb54988444d35c62abf15b313dc2867f540dbbe42
7
+ data.tar.gz: 5fd6717ebc51e50fd0538f869706baa40002aec765bac3e9b0e2b1d3b074aec4fca621dd4ff8b7abb2854589a97f9aa34ed6d100a57d3dd4e414d26de77bb72d
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
@@ -0,0 +1 @@
1
+ ruby-2.0.0-p353
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are resolved against the public RubyGems index.
source 'https://rubygems.org'

# Pull the dependency list from the gemspec in this directory.
gemspec
4
+
@@ -0,0 +1,20 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ Docx2TXT (0.0.1)
5
+ nokogiri
6
+ rubyzip
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ mini_portile (0.5.2)
12
+ nokogiri (1.6.1)
13
+ mini_portile (~> 0.5.0)
14
+ rubyzip (1.1.0)
15
+
16
+ PLATFORMS
17
+ ruby
18
+
19
+ DEPENDENCIES
20
+ Docx2TXT!
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 Thiago R. Colucci
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,32 @@
1
+ # Docx2TXT
2
+
3
+ Extract the simplest TXT (I could imagine) from an MS Docx. It just makes a best effort to preserve paragraphs.
4
+
5
+ ## How
6
+
7
+ Instantiate the docx with the location of the file
8
+
9
+     doc = Docx2TXT::Docx.new file_path
10
+
11
+ Later just ask for the txt
12
+
13
+     doc.to_txt
14
+
15
+ ## Simple executable
16
+
17
+     docx2txt <docxfilepath>
18
+
19
+ <!-- ## Code status
20
+
21
+ [![Build Status](https://travis-ci.org/thoughtworks/pacto.png)](https://travis-ci.org/thoughtworks/pacto)
22
+ [![Code Climate](https://codeclimate.com/github/thoughtworks/pacto.png)](https://codeclimate.com/github/thoughtworks/pacto)
23
+ [![Dependency Status](https://gemnasium.com/thoughtworks/pacto.png)](https://gemnasium.com/thoughtworks/pacto)
24
+ [![Coverage Status](https://coveralls.io/repos/thoughtworks/pacto/badge.png)](https://coveralls.io/r/thoughtworks/pacto) -->
25
+
26
+ ## Contributing
27
+
28
+ 1. Fork it
29
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
30
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
31
+ 4. Push to the branch (`git push origin my-new-feature`)
32
+ 5. Create new Pull Request
@@ -0,0 +1,8 @@
1
+ #! /usr/bin/env ruby
2
+ require 'docx2txt'
3
+
4
# With no argument, show the usage banner; otherwise treat the first
# argument as the path of the .docx file and print its extracted text.
docx_path = ARGV.first
output =
  if docx_path.nil?
    "Usage:\n\tdocx2txt <path-to-MS-Docx-file>"
  else
    Docx2TXT::Docx.new(docx_path).to_txt
  end
puts output
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
# Put this checkout's lib/ directory on the load path so the version
# constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'docx2txt/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "docx2txt"
  spec.version  = Docx2TXT::VERSION
  spec.license  = 'MIT'
  spec.homepage = 'https://github.com/ticolucci/docx2txt'

  # Author
  spec.authors = ["Thiago Colucci"]
  spec.email   = ["ticolucci@gmail.com"]

  # Texts shown on the gem's registry page
  spec.summary     = %q{Extract the contents of MS Docx to TXT}
  spec.description = %q{Extract the simplest TXT I could imagine from a Docx. It just do a best effort to preserve paragraphs.}

  # Package every file tracked by git; anything under bin/ becomes an executable.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  #spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies (no version constraints)
  spec.add_dependency "nokogiri"
  spec.add_dependency "rubyzip"
end
@@ -0,0 +1,32 @@
1
+ require "rubygems"
2
+ require "nokogiri"
3
+ require "zip"
4
+
5
module Docx2TXT
  # Extracts plain text from a MS Word .docx file.
  #
  # The file at the given location is opened as a zip archive and the text is
  # read from its main document XML entry.
  class Docx
    # Name of the archive entry holding the main document. Anchored and
    # escaped so only that exact entry matches; either path separator is
    # accepted between the directory and the file name.
    DOCUMENT_ENTRY = %r{\Aword[/\\]document\.xml\z}.freeze

    # @param location [String] path of the .docx file to read
    def initialize(location)
      @location = location
    end

    # Reads the document and returns its text, one line per node found at
    # the third nesting level of the XML tree.
    #
    # @return [String] the extracted text, lines joined with "\n"
    # @raise [ArgumentError] if the archive contains no document entry
    def to_txt
      get_text(get_xml)
    end

    private

    # Locates the main document entry inside the archive and parses it.
    # The block forms of Zip::File.open and get_input_stream are used so the
    # archive and the entry stream are closed once the content has been read.
    #
    # @return [Nokogiri::XML::Document] the parsed document XML
    # @raise [ArgumentError] if no entry matches DOCUMENT_ENTRY
    def get_xml
      Zip::File.open(@location) do |archive|
        entry = archive.find { |candidate| candidate.name =~ DOCUMENT_ENTRY }
        raise ArgumentError, "no word/document.xml entry found in #{@location}" unless entry

        Nokogiri.XML(entry.get_input_stream { |stream| stream.read })
      end
    end

    # Collects the text of every node three levels below +xml+ and joins the
    # pieces with newlines.
    #
    # @param xml [#children] root of the parsed XML tree
    # @return [String] the collected text; empty when there are no such nodes
    def get_text(xml)
      lines = xml.children.flat_map do |top|
        top.children.flat_map do |container|
          container.children.map(&:text)
        end
      end
      lines.join("\n")
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the docx2txt gem.
module Docx2TXT
  # Release number of the gem.
  VERSION = "0.0.2"
end
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docx2txt
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Thiago Colucci
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubyzip
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Extract the simplest TXT I could imagine from a Docx. It just do a best
42
+ effort to preserve paragraphs.
43
+ email:
44
+ - ticolucci@gmail.com
45
+ executables:
46
+ - docx2txt
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - .gitignore
51
+ - .ruby-version
52
+ - Gemfile
53
+ - Gemfile.lock
54
+ - LICENSE
55
+ - README.md
56
+ - bin/docx2txt
57
+ - docx2txt.gemspec
58
+ - lib/docx2txt.rb
59
+ - lib/docx2txt/version.rb
60
+ homepage: https://github.com/ticolucci/docx2txt
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.1.11
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: Extract the contents of MS Docx to TXT
84
+ test_files: []