doc2text 0.3.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e795071f6df878e0427e2728848d170ecb1277d0
4
- data.tar.gz: 4c02f808c1d6c47bf7edf8f1d81fc534bdf2ab93
2
+ SHA256:
3
+ metadata.gz: e803f834de30ec59e70080fa881b47e4647e467f1d443cca097aaa726d80d48a
4
+ data.tar.gz: 5ff4569486e0a8f59a089918abd5f3025e46bf3970b7e7c5d50e6d2bd38e9718
5
5
  SHA512:
6
- metadata.gz: d7a9314e425c425dc228e4f742874f3393ec77ab604e22ead973cb1430e5a01bd7395a5e8f6b6f2c71114aaad2736b5d43abcad30bb797a17a0ce842f6474ecd
7
- data.tar.gz: c82f62f3fabd0ccdf42ec6fb23d06cb3f6884676184e6a44e670e4a9f803b7fa2bc71ebd107ff067315e1ac52214c79c0477d9fd0a0388a0010f5c70fd915678
6
+ metadata.gz: 29766bf3c446cd231277da1d8f41f6d3e2c8c8b46e01f58acefe9b62f123646de7757680cd94687637ef06a439bb69a8066a623fc25235a97e70970d311886dd
7
+ data.tar.gz: 585d4d505d4ffa9c9885e813debe512edefda10943fdf1693388838c858830e0a4802f12716fbc956ac8b8de4c0899892d9b35a9a8fe32e1e3707ff3c8fd7e00
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby

# Command-line entry point for doc2text.
#
# Accepts either two positional arguments (source, output) or the
# -s/--source and -o/--output flags. A flag and a positional argument may be
# combined: whichever of source/output was not given by flag is taken from the
# remaining positional arguments.

$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
require 'doc2text'
require 'optparse'

options = {}
opt_parse = OptionParser.new do |opts|
  opts.banner = "Usage: doc2text document.odt output.md\n" \
                "   or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"

  opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
    options[:source] = file
  end

  opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
    options[:output] = file
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

begin
  # parse! removes the recognised flags from ARGV, leaving positional arguments.
  opt_parse.parse!
rescue OptionParser::InvalidOption, OptionParser::MissingArgument => e
  # A malformed command line is an error: report on stderr, exit non-zero.
  warn e.message
  warn opt_parse.to_s
  exit 1
end

# Previously a lone -s or -o made `options` non-empty, so the missing value
# reached parse_and_save as nil. Resolve every missing value explicitly.
missing = %i[source output].reject { |key| options[key] }
unless missing.empty?
  if ARGV.size == missing.size
    missing.zip(ARGV) { |key, value| options[key] = value }
  elsif options.empty?
    # No flags and an unexpected number of positional arguments: show the help.
    puts opt_parse
    exit
  else
    warn 'doc2text: both a source and an output file are required'
    warn opt_parse.to_s
    exit 1
  end
end

Doc2Text::Resolution.parse_and_save options[:source], options[:output]
@@ -2,10 +2,21 @@ require 'nokogiri'
2
2
  #require 'nokogiri/xml'
3
3
  require 'fileutils'
4
4
 
5
- require 'doc2text/odt'
6
- require 'doc2text/odt_xml_node'
7
- require 'doc2text/odt_xml_namespaces'
8
- require 'doc2text/markdown_odt_parser'
5
+ require 'doc2text/xml_based_document_file'
6
+ require 'doc2text/generic_xml_nodes'
7
+ require 'doc2text/resolution'
9
8
  require 'doc2text/errors'
10
9
 
11
- require 'doc2text/content'
10
+ require 'doc2text/odt/odt'
11
+ require 'doc2text/odt/odt_xml_namespaces'
12
+ require 'doc2text/odt/markdown_odt_parser'
13
+
14
+ require 'doc2text/docx/docx'
15
+ require 'doc2text/docx/markdown_docx_parser'
16
+ require 'doc2text/docx/docx_xml_namespaces'
17
+
18
+ require 'doc2text/pptx/pptx'
19
+ require 'doc2text/pptx/markdown_pptx_parser'
20
+ require 'doc2text/pptx/pptx_xml_namespaces'
21
+
22
+ require 'doc2text/styles_parser'
@@ -0,0 +1,31 @@
1
module Doc2Text
  module Docx
    # A .docx document handled through its unpacked archive contents.
    # unpack, parse_styles, parse, clean and extract_path are not defined
    # here; presumably they come from XmlBasedDocument::DocumentFile.
    class Document < XmlBasedDocument::DocumentFile
      # Converts the document at +input+ and writes the result to the file
      # named +output_filename+ (opened for writing, truncating any content).
      #
      # The parser is always closed once parsing was attempted, and the
      # document's #clean is always invoked after the instance was created.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          styles_root = document.parse_styles
          sink = File.open(output_filename, 'w')
          parser = Markdown::DocxParser.new(sink, styles_root)
          begin
            document.parse(parser)
          ensure
            # DocxParser#close closes the output it was given.
            parser.close
          end
        ensure
          document.clean
        end
      end

      # True when '[Content_Types].xml' exists under extract_path.
      def contains_extracted_files?
        content_types = File.join(extract_path, '[Content_Types].xml')
        File.exist?(content_types)
      end

      # Name fragment identifying docx extraction output.
      def extract_extension
        'unpacked_docx'
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module Doc2Text
  module Docx
    # Node classes used for the element tree built while parsing a docx file.
    module XmlNodes
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node for the element <prefix:name>.
        #
        # The class is looked up as "<Prefix>::W<name>" inside XmlNodes (for
        # example prefix "w", name "body" resolves to W::Wbody). Any NameError
        # raised by the lookup falls back to a Generic node; this includes the
        # NoMethodError raised by calling capitalize on a nil prefix.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class = XmlNodes.const_get("#{prefix.capitalize}::W#{name}")
        rescue NameError
          # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
          Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
        else
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Only the body element answers true (see W::Wbody).
        def body?
          false
        end
      end

      # Text content; never the body element.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Fallback for elements without a dedicated class.
      class Generic < Node
      end

      # Element classes for the "w" prefix.
      module W
        # <w:body>
        class Wbody < Node
          def body?
            true
          end
        end

        # <w:br> is emitted as an HTML line break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # <w:p> is wrapped in newlines.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'logger'
2
+
3
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Docx::XmlNodes nodes while the XML is
    # read. Every element that closes directly under the body node is expanded,
    # appended to the output and then dropped from the tree.
    class DocxParser < Nokogiri::XML::SAX::Document
      # output          - object receiving the expanded text via #<<; it is
      #                   closed by #close.
      # styles_xml_root - optional root node of an already parsed styles tree,
      #                   also searched by #xpath.
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # SAX callback: the first element becomes the root, every later element
      # is appended to the current node and becomes the current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # SAX callback: flushes a finished direct child of the body node to the
      # output, then moves back up to the parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        if @current_node.parent && @current_node.parent.body?
          @output << @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # SAX callback: whitespace-only text is ignored, anything else is stored
      # as a PlainText child of the current node.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Docx::XmlNodes::PlainText.new(string)
      end

      # Closes the output given to the constructor.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ holds one or more absolute paths ("/a:b/c:d") joined by "|".
      # Returns the matching nodes from the document tree followed by those
      # from the styles tree (when one was given).
      # Raises Doc2Text::XmlError for any other syntax.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        patterns.inject([]) do |result, pattern|
          # \A and \z anchor the whole string; ^ and $ only anchor a line, which
          # let a multi-line pattern through and merged its lines into one path.
          raise Doc2Text::XmlError, 'it does not support this xpath syntax' unless %r{\A(/[\w:\-]+)+\z} =~ pattern

          path = pattern.scan(/[\w:\-]+/)
          result += xpath_search_nodes(path, @xml_root)
          result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
          result
        end
      end

      # Walks +path+ (an array of "prefix:name" strings) down from +xml_root+
      # and returns the nodes matching the last segment.
      def xpath_search_nodes(path, xml_root)
        # Nothing has been parsed yet (or no tree was supplied): no matches.
        return [] unless xml_root

        seek_nodes = [xml_root]
        path.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # compact guards against nodes whose children collection is nil.
          seek_nodes = seek_nodes.flat_map(&:children).compact unless index == path.length - 1
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
module Doc2Text
  module XmlBasedDocument
    module XmlNodes
      # Base class for the nodes of the in-memory XML tree.
      class Node
        attr_reader :parent, :children, :attrs, :prefix, :name
        attr_accessor :text

        # Gives every subclass a class-level +titleize+ helper that turns a
        # dash-separated tag into CamelCase ("list-item" -> "ListItem").
        def self.inherited(subclass)
          super
          subclass.define_singleton_method(:titleize) do |tag|
            tag.split('-').map(&:capitalize).join
          end
        end

        # parent              - enclosing node, nil for the root.
        # attrs               - attributes of the element.
        # prefix, name        - namespace prefix and local name of the element.
        # markdown_odt_parser - parser that created the node (kept in @xml_parser).
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
          @parent = parent
          @attrs = attrs
          @prefix = prefix
          @name = name
          @xml_parser = markdown_odt_parser
          @children = []
          @has_text = false
        end

        # True when the node has no parent.
        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text emitted before the children; empty by default.
        def open
          ''
        end

        # Text emitted after the children; empty by default.
        def close
          ''
        end

        # Recursively drops all descendants so they can be garbage collected.
        def delete
          return true unless @children

          @children.each(&:delete)
          @children = []
        end

        # Two nodes are eql? when they carry the same qualified XML name.
        def eql?(object)
          return false unless object.is_a? Node

          object.xml_name == xml_name
        end

        # True only for direct instances of this base class.
        # NOTE(review): instances of subclasses (including any "Generic"
        # subclass) answer false - confirm this is intended.
        def generic?
          instance_of? Node
        end

        # Qualified name in "prefix:name" form.
        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # Returns open + the expansion of every child + close, then deletes
        # the children: a node can only be expanded once.
        def expand
          expanded = "#{open}#{@children.map(&:expand).join}#{close}"
          delete
          expanded
        end
      end

      # Leaf node carrying literal text; expanding it yields the text itself.
      class PlainText < Node
        attr_accessor :text

        alias_method :expand, :text

        def initialize(text)
          # Run the base initializer so that children is an empty array rather
          # than nil; tree walks calling children.each would otherwise raise
          # NoMethodError on text nodes.
          super()
          @text = text
        end
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'logger'
2
+
3
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Odt::XmlNodes nodes while the XML is
    # read. Every element that closes directly under the office-text node is
    # expanded, written to the output and then dropped from the tree.
    class OdtParser < Nokogiri::XML::SAX::Document
      # output          - object receiving the expanded text via #write; it is
      #                   closed by #close.
      # styles_xml_root - optional root node of an already parsed styles tree,
      #                   also searched by #xpath.
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # SAX callback: the first element becomes the root, every later element
      # is appended to the current node and becomes the current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # SAX callback: flushes a finished direct child of the office-text node
      # to the output, then moves back up to the parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        if @current_node.parent && @current_node.parent.office_text?
          @output.write @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # SAX callback: whitespace-only text is ignored, anything else is stored
      # as a PlainText child of the current node.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Odt::XmlNodes::PlainText.new(string)
      end

      # Closes the output given to the constructor.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ holds one or more absolute paths ("/a:b/c:d") joined by "|".
      # Returns the matching nodes from the document tree followed by those
      # from the styles tree (when one was given).
      # Raises Doc2Text::XmlError for any other syntax.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        patterns.inject([]) do |result, pattern|
          # \A and \z anchor the whole string; ^ and $ only anchor a line, which
          # let a multi-line pattern through and merged its lines into one path.
          raise Doc2Text::XmlError, 'it does not support this xpath syntax' unless %r{\A(/[\w:\-]+)+\z} =~ pattern

          path = pattern.scan(/[\w:\-]+/)
          result += xpath_search_nodes(path, @xml_root)
          result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
          result
        end
      end

      # Walks +path+ (an array of "prefix:name" strings) down from +xml_root+
      # and returns the nodes matching the last segment.
      def xpath_search_nodes(path, xml_root)
        # Nothing has been parsed yet (or no tree was supplied): no matches.
        return [] unless xml_root

        seek_nodes = [xml_root]
        path.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # compact guards against nodes whose children collection is nil.
          seek_nodes = seek_nodes.flat_map(&:children).compact unless index == path.length - 1
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Doc2Text
  module Odt
    # An .odt document handled through its unpacked archive contents.
    # unpack, clean, extract_path and (presumably) open are not defined here;
    # they are expected from XmlBasedDocument::DocumentFile - TODO confirm.
    class Document < XmlBasedDocument::DocumentFile
      # Converts the document at +input+ and writes the result to the file
      # named +output_filename+ (opened for writing, truncating any content).
      #
      # The parser is always closed once parsing was attempted, and the
      # document's #clean is always invoked after the instance was created.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          styles_root = document.parse_styles
          sink = File.open(output_filename, 'w')
          parser = Markdown::OdtParser.new(sink, styles_root)
          begin
            document.parse(parser)
          ensure
            # OdtParser#close closes the output it was given.
            parser.close
          end
        ensure
          document.clean
        end
      end

      # Name fragment identifying odt extraction output.
      def extract_extension
        'unpacked_odt'
      end

      # Runs a SAX parse over 'styles.xml' and returns the root of the
      # resulting styles tree.
      def parse_styles
        handler = Doc2Text::Odt::StylesParser.new
        sax = Nokogiri::XML::SAX::Parser.new(handler)
        sax.parse(open('styles.xml'))
        handler.xml_root
      end

      # Feeds 'content.xml' through a SAX parser driven by +markdown+.
      def parse(markdown)
        sax = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
        sax.parse(open('content.xml'))
      end

      # True when both 'content.xml' and 'mimetype' exist under extract_path.
      def contains_extracted_files?
        required = %w[content.xml mimetype].map { |entry| File.join(extract_path, entry) }
        required.all? { |path| File.exist?(path) }
      end
    end
  end
end