doc2text 0.3.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
- - Valentin Aitken
7
+ - Valentin A.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-23 00:00:00.000000000 Z
11
+ date: 2021-01-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,56 +16,54 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.6'
20
- - - ">="
21
- - !ruby/object:Gem::Version
22
- version: 1.6.3
19
+ version: 1.11.1
23
20
  type: :runtime
24
21
  prerelease: false
25
22
  version_requirements: !ruby/object:Gem::Requirement
26
23
  requirements:
27
24
  - - "~>"
28
25
  - !ruby/object:Gem::Version
29
- version: '1.6'
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: 1.6.3
26
+ version: 1.11.1
33
27
  - !ruby/object:Gem::Dependency
34
28
  name: rubyzip
35
29
  requirement: !ruby/object:Gem::Requirement
36
30
  requirements:
37
31
  - - "~>"
38
32
  - !ruby/object:Gem::Version
39
- version: '1.1'
40
- - - ">="
41
- - !ruby/object:Gem::Version
42
- version: 1.1.6
33
+ version: 2.3.0
43
34
  type: :runtime
44
35
  prerelease: false
45
36
  version_requirements: !ruby/object:Gem::Requirement
46
37
  requirements:
47
38
  - - "~>"
48
39
  - !ruby/object:Gem::Version
49
- version: '1.1'
50
- - - ">="
51
- - !ruby/object:Gem::Version
52
- version: 1.1.6
40
+ version: 2.3.0
53
41
  description: Parses odt to markdown
54
- email: bostko@gmail.com
55
- executables: []
42
+ email: valentin@nalisbg.com
43
+ executables:
44
+ - doc2text
56
45
  extensions: []
57
46
  extra_rdoc_files: []
58
47
  files:
48
+ - bin/doc2text
59
49
  - lib/doc2text.rb
60
- - lib/doc2text/content.rb
50
+ - lib/doc2text/docx/docx.rb
51
+ - lib/doc2text/docx/docx_xml_namespaces.rb
52
+ - lib/doc2text/docx/markdown_docx_parser.rb
61
53
  - lib/doc2text/errors.rb
62
- - lib/doc2text/markdown_odt_parser.rb
63
- - lib/doc2text/odt.rb
64
- - lib/doc2text/odt_xml_namespaces.rb
65
- - lib/doc2text/odt_xml_node.rb
54
+ - lib/doc2text/generic_xml_nodes.rb
55
+ - lib/doc2text/odt/markdown_odt_parser.rb
56
+ - lib/doc2text/odt/odt.rb
57
+ - lib/doc2text/odt/odt_xml_namespaces.rb
58
+ - lib/doc2text/pptx/markdown_pptx_parser.rb
59
+ - lib/doc2text/pptx/pptx.rb
60
+ - lib/doc2text/pptx/pptx_xml_namespaces.rb
61
+ - lib/doc2text/resolution.rb
62
+ - lib/doc2text/styles_parser.rb
63
+ - lib/doc2text/xml_based_document_file.rb
66
64
  homepage: http://doc2text.com
67
65
  licenses:
68
- - GPL
66
+ - Apache-2.0
69
67
  metadata: {}
70
68
  post_install_message:
71
69
  rdoc_options: []
@@ -82,8 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
80
  - !ruby/object:Gem::Version
83
81
  version: '0'
84
82
  requirements: []
85
- rubyforge_project:
86
- rubygems_version: 2.2.2
83
+ rubygems_version: 3.1.2
87
84
  signing_key:
88
85
  specification_version: 4
89
86
  summary: Translates odt to markdown
@@ -1,25 +0,0 @@
1
- module Doc2Text
2
- module Odt
3
- module Content
4
- class Document < ::Nokogiri::XML::SAX::Document
5
- def initialize(markdown_odt_parser)
6
- @markdown_odt_parser = markdown_odt_parser
7
- end
8
-
9
- def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
10
- @markdown_odt_parser.new_node prefix, name, attrs
11
- end
12
-
13
- def end_element_namespace(name, prefix = nil, uri = nil)
14
- @markdown_odt_parser.close_node prefix, name
15
- end
16
-
17
- def characters(string)
18
- unless string.strip.empty?
19
- @markdown_odt_parser.text string
20
- end
21
- end
22
- end
23
- end
24
- end
25
- end
@@ -1,72 +0,0 @@
1
- require 'logger'
2
-
3
- module Doc2Text
4
- module Markdown
5
- class OdtParser
6
- def initialize(output)
7
- @output = output
8
- @automatic_styles = {}
9
- end
10
-
11
- def new_node(prefix, name, attrs)
12
- unless @xml_root
13
- @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
14
- else
15
- new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
16
- @current_node.children << new_node
17
- @current_node = new_node
18
- end
19
- end
20
-
21
- def close_node(prefix, name)
22
- # if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
23
- if @current_node.parent and @current_node.parent.office_text?
24
- @output << @current_node.expand
25
- @current_node.delete
26
- end
27
- @current_node = @current_node.parent
28
- # else
29
- # # TODO remove this redundant(tree build algorithm) checks
30
- # raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
31
- # end
32
- end
33
-
34
- def text(string)
35
- plain_text = Odt::XmlNodes::PlainText.new(string)
36
- @current_node.children << plain_text
37
- end
38
-
39
- def close
40
- @output.close
41
- end
42
-
43
- def print_tree(node)
44
- puts node
45
- node.children.each do |child|
46
- print_tree child
47
- end
48
- end
49
-
50
- # Select nodes xpath style
51
- # - supports selecting from the root node
52
- def xpath(string)
53
- if /^(\/[\w:\-]+)+$/ =~ string
54
- path = string.scan /[\w:\-]+/
55
- seek_nodes = [@xml_root]
56
- path.each_with_index do |xml_name, index|
57
- seek_nodes.select! { |node| node.xml_name == xml_name }
58
- seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
59
- break if seek_nodes.empty?
60
- end
61
- seek_nodes
62
- else
63
- raise Doc2Text::XmlError, 'it does not support this xpath syntax'
64
- end
65
- end
66
-
67
- def logger
68
- @logger ||= Logger.new(STDOUT)
69
- end
70
- end
71
- end
72
- end
@@ -1,73 +0,0 @@
1
- require 'zip'
2
-
3
- module Doc2Text
4
- module Odt
5
- class Document
6
- EXTRACT_EXTENSION = 'unpacked_odt'
7
-
8
- def self.parse_and_save(input, output_filename)
9
- odt = new input
10
- begin
11
- odt.unpack
12
- output = File.open output_filename, 'w'
13
- markdown = Markdown::OdtParser.new output
14
- begin
15
- odt.parse markdown
16
- ensure
17
- markdown.close
18
- end
19
- ensure
20
- odt.clean
21
- end
22
- end
23
-
24
- def parse(markdown)
25
- content = ::Doc2Text::Odt::Content::Document.new markdown
26
- parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
27
- parser.parse open 'content.xml'
28
- end
29
-
30
- def initialize(document_path)
31
- @document_path = document_path
32
- end
33
-
34
- def unpack
35
- Zip::File.open(@document_path) {
36
- |zip_file|
37
- Dir.mkdir(extract_path)
38
- zip_file.each do |entry|
39
- zipped_file_extract_path = File.join extract_path, entry.name
40
- FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
41
- zip_file.extract entry, zipped_file_extract_path
42
- end
43
- }
44
- end
45
-
46
- def clean
47
- if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
48
- FileUtils.rm_r extract_path
49
- end
50
- end
51
-
52
- # Open file from the current odt
53
- def open(filename)
54
- File.open File.join(extract_path, filename), 'r'
55
- end
56
-
57
- # Parse xml file from the current odt
58
- def xml_file(filename, rood_node_name)
59
- Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
60
- root_node = doc.root
61
- if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
62
- raise XmlError, 'Document does not have correct root element'
63
- else
64
- open(filename)
65
- end
66
- end
67
-
68
- def extract_path
69
- File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
70
- end
71
- end
72
- end
73
- end
@@ -1,215 +0,0 @@
1
- module Doc2Text
2
- module Odt
3
- module XmlNodes
4
- class PlainText
5
- include Node
6
-
7
- attr_accessor :text
8
-
9
- alias_method :expand, :text
10
-
11
- def initialize(text)
12
- @text = text
13
- end
14
- end
15
-
16
- class Generic
17
- include Node
18
- end
19
-
20
- #
21
- # These are the namespaces available in the open document format
22
- # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
23
- #
24
- module Office
25
- class AutomaticStyles
26
- include Node
27
- end
28
-
29
- class DocumentContent
30
- include Node
31
- end
32
-
33
- class Text
34
- include Node
35
-
36
- def office_text?
37
- true
38
- end
39
- end
40
- end
41
-
42
- module Animation; end
43
- module Chart; end
44
- module Config; end
45
- module Database; end
46
- module Dr3d; end
47
- module Drawing; end
48
- module Form; end
49
- module Manifest; end
50
- module Meta; end
51
- module DataStyle; end
52
- module Presentation; end
53
- module Script; end
54
- module Table
55
- class TableRow
56
- include Node
57
-
58
- def expand
59
- header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
60
- result = "\n#{@children.map(&:expand).join.strip.gsub "\n", ''} |#{header_delimiter}"
61
- delete
62
- result
63
- end
64
- end
65
-
66
- class TableCell
67
- include Node
68
-
69
- def open
70
- ' | '
71
- end
72
- end
73
- end
74
- module Style
75
- class Style
76
- include Node
77
- end
78
-
79
- class TextProperties
80
- include Node
81
- end
82
- end
83
- module XslFoCompatible; end
84
- module SvgCompatible; end
85
- module SmilCompatible; end
86
- module Of; end
87
-
88
- module Text
89
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
90
- super parent, attrs, prefix, name
91
- @markdown_odt_parser = markdown_odt_parser
92
- style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
93
- @enclosing_style = []
94
- if style_index and fetch_style?
95
- elem_style = find_style attrs[style_index].value
96
- fetch_style elem_style
97
- end
98
- end
99
-
100
- def fetch_style?
101
- true
102
- end
103
-
104
- def fetch_style(elem_style)
105
- if elem_style
106
- elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
107
- text_property.attrs.each { |attr|
108
- if attr.prefix == 'style'
109
- if attr.localname == 'font-style-complex' && attr.value == 'italic'
110
- @enclosing_style << '_'
111
- elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
112
- @enclosing_style << '**'
113
- end
114
- end
115
- }
116
- }
117
- end
118
- end
119
-
120
- def find_style(style_name)
121
- styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
122
- styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
123
- style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
124
- end
125
-
126
- # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
127
- class P
128
- include Node
129
- include Text
130
-
131
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
132
- super parent, attrs, prefix, name, markdown_odt_parser
133
- end
134
-
135
- def self.style_family
136
- 'paragraph'
137
- end
138
-
139
- def open
140
- "\n#{@enclosing_style.join}"
141
- end
142
-
143
- def close
144
- "#{@enclosing_style.reverse.join}\n"
145
- end
146
- end
147
-
148
- class LineBreak
149
- include Node
150
-
151
- def open
152
- '<br/>'
153
- end
154
- end
155
-
156
- # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
157
- class Span
158
- include Node
159
- include Text
160
-
161
- def self.style_family
162
- 'text'
163
- end
164
-
165
- def open
166
- @enclosing_style.join
167
- end
168
-
169
- def close
170
- @enclosing_style.reverse.join
171
- end
172
- end
173
-
174
- # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
175
- class ListItem
176
- include Node
177
- include Text
178
-
179
- def expand
180
- result = "* #{@children.map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
181
- delete
182
- result.clone
183
- end
184
-
185
- def fetch_style?
186
- false
187
- end
188
- end
189
-
190
- class List
191
- include Node
192
- include Text
193
-
194
- def open
195
- "\n"
196
- end
197
-
198
- def fetch_style(elem_style)
199
- if elem_style
200
- elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
201
- text_property.attrs.each { |attr|
202
- if attr.prefix == 'style'
203
- if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
204
- @enclosing_style << '_'
205
- end
206
- end
207
- }
208
- }
209
- end
210
- end
211
- end
212
- end
213
- end
214
- end
215
- end