doc2text 0.3.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e795071f6df878e0427e2728848d170ecb1277d0
4
- data.tar.gz: 4c02f808c1d6c47bf7edf8f1d81fc534bdf2ab93
2
+ SHA256:
3
+ metadata.gz: e803f834de30ec59e70080fa881b47e4647e467f1d443cca097aaa726d80d48a
4
+ data.tar.gz: 5ff4569486e0a8f59a089918abd5f3025e46bf3970b7e7c5d50e6d2bd38e9718
5
5
  SHA512:
6
- metadata.gz: d7a9314e425c425dc228e4f742874f3393ec77ab604e22ead973cb1430e5a01bd7395a5e8f6b6f2c71114aaad2736b5d43abcad30bb797a17a0ce842f6474ecd
7
- data.tar.gz: c82f62f3fabd0ccdf42ec6fb23d06cb3f6884676184e6a44e670e4a9f803b7fa2bc71ebd107ff067315e1ac52214c79c0477d9fd0a0388a0010f5c70fd915678
6
+ metadata.gz: 29766bf3c446cd231277da1d8f41f6d3e2c8c8b46e01f58acefe9b62f123646de7757680cd94687637ef06a439bb69a8066a623fc25235a97e70970d311886dd
7
+ data.tar.gz: 585d4d505d4ffa9c9885e813debe512edefda10943fdf1693388838c858830e0a4802f12716fbc956ac8b8de4c0899892d9b35a9a8fe32e1e3707ff3c8fd7e00
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby

# Command-line entry point: converts a source document into a markdown file.
#
# Two invocation forms are accepted:
#   doc2text SOURCE OUTPUT
#   doc2text -s SOURCE -o OUTPUT
#
# Exit status is 0 on success or when help is requested, and 1 when the
# arguments cannot be parsed or either file name is missing.

$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
require 'doc2text'
require 'optparse'

options = {}
opt_parse = OptionParser.new do |opts|
  opts.banner = "Usage: doc2text document.odt output.md\n" \
                "or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"

  opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
    options[:source] = file
  end

  opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
    options[:output] = file
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

begin
  opt_parse.parse!
rescue OptionParser::ParseError => e
  # Every OptionParser failure is a usage error: report it on stderr and
  # signal failure to the calling shell instead of exiting with status 0.
  warn e.message
  warn opt_parse.to_s
  exit 1
end

# Fall back to the positional form only when no switch was given.
options[:source], options[:output] = ARGV if options.empty? && ARGV.size == 2

# Both file names are required; without this check a lone -s or -o would pass
# nil on to parse_and_save.
unless options[:source] && options[:output]
  warn opt_parse.to_s
  exit 1
end

Doc2Text::Resolution.parse_and_save options[:source], options[:output]
@@ -2,10 +2,21 @@ require 'nokogiri'
2
2
  #require 'nokogiri/xml'
3
3
  require 'fileutils'
4
4
 
5
- require 'doc2text/odt'
6
- require 'doc2text/odt_xml_node'
7
- require 'doc2text/odt_xml_namespaces'
8
- require 'doc2text/markdown_odt_parser'
5
+ require 'doc2text/xml_based_document_file'
6
+ require 'doc2text/generic_xml_nodes'
7
+ require 'doc2text/resolution'
9
8
  require 'doc2text/errors'
10
9
 
11
- require 'doc2text/content'
10
+ require 'doc2text/odt/odt'
11
+ require 'doc2text/odt/odt_xml_namespaces'
12
+ require 'doc2text/odt/markdown_odt_parser'
13
+
14
+ require 'doc2text/docx/docx'
15
+ require 'doc2text/docx/markdown_docx_parser'
16
+ require 'doc2text/docx/docx_xml_namespaces'
17
+
18
+ require 'doc2text/pptx/pptx'
19
+ require 'doc2text/pptx/markdown_pptx_parser'
20
+ require 'doc2text/pptx/pptx_xml_namespaces'
21
+
22
+ require 'doc2text/styles_parser'
@@ -0,0 +1,31 @@
1
module Doc2Text
  module Docx
    # DOCX flavour of the XML-based document file.
    class Document < XmlBasedDocument::DocumentFile
      # Unpacks +input+, parses it with a Markdown::DocxParser writing to
      # +output_filename+, and always cleans the unpacked files afterwards.
      #
      # The parser is closed even when parsing raises; the document is cleaned
      # even when unpacking or style parsing raises.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          styles = document.parse_styles
          parser = Markdown::DocxParser.new(File.open(output_filename, 'w'), styles)
          begin
            document.parse(parser)
          ensure
            parser.close
          end
        ensure
          document.clean
        end
      end

      # True when the extraction directory holds the '[Content_Types].xml' file.
      def contains_extracted_files?
        content_types = File.join(extract_path, '[Content_Types].xml')
        File.exist?(content_types)
      end

      # Name fragment identifying extracted DOCX content.
      def extract_extension
        'unpacked_docx'
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module Doc2Text
  module Docx
    module XmlNodes
      # Base class for nodes created from DOCX XML elements.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node for the element <prefix:name>.
        #
        # The class is looked up as "<Prefix>::W<name>" under XmlNodes; when
        # the lookup raises NameError a Generic node is created instead.
        # NOTE(review): a nil prefix raises NoMethodError (a NameError
        # subclass) on #capitalize and therefore also yields Generic.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Ordinary nodes are not the document body.
        def body?
          false
        end
      end

      # Text content; never the document body.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Fallback for elements without a dedicated class.
      class Generic < Node
      end

      # Classes for elements carrying the "w" prefix.
      module W
        # <w:body>: the only node reporting body? as true.
        class Wbody < Node
          def body?
            true
          end
        end

        # <w:br>: rendered as an HTML line break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # <w:p>: wrapped in newlines.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'logger'
2
+
3
module Doc2Text
  module Markdown
    # SAX handler that builds a node tree from DOCX XML and writes the expanded
    # text of each direct child of the body node to the output.
    class DocxParser < Nokogiri::XML::SAX::Document
      # output          - object receiving the text via << (closed by #close)
      # styles_xml_root - optional second tree that #xpath also searches
      def initialize(output, styles_xml_root = nil)
        @output = output
        @styles_xml_root = styles_xml_root
        @automatic_styles = {}
      end

      # The first element becomes the root; later elements are appended to the
      # current node and become the current node themselves.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          child = Docx::XmlNodes::Node.create_node(prefix, name, @current_node, attrs, self)
          @current_node.children << child
          @current_node = child
        else
          @xml_root = @current_node = Docx::XmlNodes::Node.create_node(prefix, name, nil, attrs, self)
        end
      end

      # Flushes a finished direct child of the body node, then steps back up
      # to the parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        owner = @current_node.parent
        if owner && owner.body?
          @output << @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # Adds non-blank character data to the current node as a text node.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Docx::XmlNodes::PlainText.new(string)
      end

      # Closes the output.
      def close
        @output.close
      end

      # Prints +node+ and all of its descendants, one per line.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # Accepts absolute paths ("/a:b/c:d") joined with "|"; anything else
      # raises Doc2Text::XmlError.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        patterns.flat_map do |pattern|
          raise Doc2Text::XmlError, 'it does not support this xpath syntax' unless /^(\/[\w:\-]+)+$/ =~ pattern

          path = pattern.scan(/[\w:\-]+/)
          matches = xpath_search_nodes(path, @xml_root)
          matches += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
          matches
        end
      end

      # Follows +path+ (a list of "prefix:name" strings) from +xml_root+ and
      # returns the nodes matching its last step.
      def xpath_search_nodes(path, xml_root)
        candidates = [xml_root]
        last_step = path.length - 1
        path.each_with_index do |step, depth|
          candidates = candidates.select { |node| node.xml_name == step }
          candidates = candidates.map(&:children).flatten if depth < last_step
          break if candidates.empty?
        end
        candidates
      end

      # Lazily created logger writing to standard output.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
module Doc2Text
  module XmlBasedDocument
    module XmlNodes
      # A node of the in-memory tree built while parsing a document's XML.
      class Node
        attr_reader :parent, :children, :attrs, :prefix, :name
        attr_accessor :text

        # Gives every subclass a +titleize+ class method that turns a
        # dash-separated tag ("table-cell") into "TableCell".
        def self.inherited(subclass)
          super
          def subclass.titleize(tag)
            tag.split('-').map(&:capitalize).join
          end
        end

        # parent              - enclosing node, nil for the root
        # attrs               - attributes as handed over by the parser
        # prefix, name        - namespace prefix and local element name
        # markdown_odt_parser - parser that created the node
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
          @parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
          @children = []
          @has_text = false
        end

        # True when the node has no parent.
        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text emitted before the children; empty by default.
        def open
          ''
        end

        # Text emitted after the children; empty by default.
        def close
          ''
        end

        # Recursively drops all descendants.
        def delete
          return true unless @children

          @children.each(&:delete)
          @children = []
        end

        # Nodes are eql? when their qualified names match.
        # NOTE(review): #hash is not overridden, so hash-based lookups
        # (Hash keys, Array#uniq) still distinguish nodes by identity.
        def eql?(object)
          return false unless object.is_a? Node

          object.xml_name == xml_name
        end

        # True only for direct instances of this class, not of subclasses.
        def generic?
          instance_of? Node
        end

        # Qualified name in "prefix:name" form.
        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # Renders the node as open + expanded children + close, then releases
        # the children. The interpolated string is already a fresh object, so
        # no extra copy is made.
        def expand
          expanded = "#{open}#{@children.map(&:expand).join}#{close}"
          delete
          expanded
        end
      end

      # Leaf node holding character data; expands to its text.
      class PlainText < Node
        alias_method :expand, :text

        def initialize(text)
          # Run the base initializer so #children is an empty array instead of
          # nil; tree walks that call children.each must not fail on text.
          super()
          @text = text
        end
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'logger'
2
+
3
module Doc2Text
  module Markdown
    # SAX handler that builds a node tree from ODT XML and writes the expanded
    # text of each direct child of the office-text node to the output.
    class OdtParser < Nokogiri::XML::SAX::Document
      # output          - object receiving the text via #write (closed by #close)
      # styles_xml_root - optional second tree that #xpath also searches
      def initialize(output, styles_xml_root = nil)
        @output = output
        @styles_xml_root = styles_xml_root
        @automatic_styles = {}
      end

      # The first element becomes the root; later elements are appended to the
      # current node and become the current node themselves.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          child = Odt::XmlNodes::Node.create_node(prefix, name, @current_node, attrs, self)
          @current_node.children << child
          @current_node = child
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node(prefix, name, nil, attrs, self)
        end
      end

      # Flushes a finished direct child of the office-text node, then steps
      # back up to the parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        owner = @current_node.parent
        if owner && owner.office_text?
          @output.write @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # Adds non-blank character data to the current node as a text node.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Odt::XmlNodes::PlainText.new(string)
      end

      # Closes the output.
      def close
        @output.close
      end

      # Prints +node+ and all of its descendants, one per line.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # Accepts absolute paths ("/a:b/c:d") joined with "|"; anything else
      # raises Doc2Text::XmlError.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        patterns.flat_map do |pattern|
          raise Doc2Text::XmlError, 'it does not support this xpath syntax' unless /^(\/[\w:\-]+)+$/ =~ pattern

          path = pattern.scan(/[\w:\-]+/)
          matches = xpath_search_nodes(path, @xml_root)
          matches += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
          matches
        end
      end

      # Follows +path+ (a list of "prefix:name" strings) from +xml_root+ and
      # returns the nodes matching its last step.
      def xpath_search_nodes(path, xml_root)
        candidates = [xml_root]
        last_step = path.length - 1
        path.each_with_index do |step, depth|
          candidates = candidates.select { |node| node.xml_name == step }
          candidates = candidates.map(&:children).flatten if depth < last_step
          break if candidates.empty?
        end
        candidates
      end

      # Lazily created logger writing to standard output.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Doc2Text
  module Odt
    # ODT flavour of the XML-based document file.
    class Document < XmlBasedDocument::DocumentFile
      # Name fragment identifying extracted ODT content.
      def extract_extension
        'unpacked_odt'
      end

      # Unpacks +input+, parses it with a Markdown::OdtParser writing to
      # +output_filename+, and always cleans the unpacked files afterwards.
      #
      # The parser is closed even when parsing raises; the document is cleaned
      # even when unpacking or style parsing raises.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          styles = document.parse_styles
          parser = Markdown::OdtParser.new(File.open(output_filename, 'w'), styles)
          begin
            document.parse(parser)
          ensure
            parser.close
          end
        ensure
          document.clean
        end
      end

      # Runs a SAX parse of 'styles.xml' and returns the resulting root node.
      # NOTE(review): +open+ is not defined in this class; presumably it is
      # inherited from DocumentFile - confirm.
      def parse_styles
        handler = Doc2Text::Odt::StylesParser.new
        sax_parser = Nokogiri::XML::SAX::Parser.new(handler)
        sax_parser.parse(open('styles.xml'))
        handler.xml_root
      end

      # Feeds 'content.xml' through a SAX parser driven by +markdown+.
      def parse(markdown)
        sax_parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
        sax_parser.parse(open('content.xml'))
      end

      # True when both 'content.xml' and 'mimetype' exist under extract_path.
      def contains_extracted_files?
        required = %w[content.xml mimetype].map { |entry| File.join(extract_path, entry) }
        required.all? { |file| File.exist?(file) }
      end
    end
  end
end