doc2text 0.4.0 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/doc2text/odt/markdown_odt_parser.rb +1 -1
- data/lib/doc2text/pptx/markdown_pptx_parser.rb +55 -0
- data/lib/doc2text/pptx/pptx.rb +30 -0
- data/lib/doc2text/pptx/pptx_xml_namespaces.rb +55 -0
- data/lib/doc2text/resolution.rb +2 -0
- data/lib/doc2text.rb +4 -0
- metadata +19 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dcded1a71b2126a042cde415956262082bf8bb256102aaf8fcfe731ed125161b
|
4
|
+
data.tar.gz: af197aac8bd0bb9b40a9b9f78358e54c788f3a582acb8cb77c0e227200cfe5f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c0aed71728b8be859273e6d8b502f9007e80ab6de17666d78095f9877d4035241656c127ef705e3c7878b33cf6d26fdd00f615f8c95e4bd2998f816f584c162
|
7
|
+
data.tar.gz: 5ad185c29658a5e98537221415f77dd5efc1f9b43d28144b57be9cedd998bd5c7cf808438ae4b23f9d712f0e2bb4af6d1546897e1d5951ed9ab4e54ff4263630
|
data/lib/doc2text/odt/markdown_odt_parser.rb
CHANGED
@@ -21,7 +21,7 @@ module Doc2Text
|
|
21
21
|
|
22
22
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
23
23
|
if @current_node.parent and @current_node.parent.office_text?
|
24
|
-
@output
|
24
|
+
@output.write @current_node.expand
|
25
25
|
@current_node.delete
|
26
26
|
end
|
27
27
|
@current_node = @current_node.parent
|
data/lib/doc2text/pptx/markdown_pptx_parser.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Pptx
|
3
|
+
module XmlNodes
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def body?
|
17
|
+
false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
22
|
+
def body?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Generic < Node
|
28
|
+
end
|
29
|
+
|
30
|
+
module W
|
31
|
+
class Wbody < Node
|
32
|
+
def body?
|
33
|
+
true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Wbr < Node
|
38
|
+
def open
|
39
|
+
'<br/>'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Wp < Node
|
44
|
+
def open
|
45
|
+
"\n"
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
"\n"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/doc2text/pptx/pptx.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Pptx
|
3
|
+
class Document < XmlBasedDocument::DocumentFile
|
4
|
+
|
5
|
+
def self.parse_and_save(input, output_filename)
|
6
|
+
docx = new input
|
7
|
+
begin
|
8
|
+
docx.unpack
|
9
|
+
output = File.open output_filename, 'w'
|
10
|
+
markdown = Markdown::DocxParser.new output, nil
|
11
|
+
begin
|
12
|
+
docx.parse markdown
|
13
|
+
ensure
|
14
|
+
markdown.close
|
15
|
+
end
|
16
|
+
ensure
|
17
|
+
docx.clean
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def contains_extracted_files?
|
22
|
+
File.exist? File.join(extract_path, '[Content_Types].xml')
|
23
|
+
end
|
24
|
+
|
25
|
+
def extract_extension
|
26
|
+
'unpacked_pptx'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/doc2text/pptx/pptx_xml_namespaces.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Pptx
|
3
|
+
module XmlNodes
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def body?
|
17
|
+
false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
22
|
+
def body?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Generic < Node
|
28
|
+
end
|
29
|
+
|
30
|
+
module W
|
31
|
+
class Wbody < Node
|
32
|
+
def body?
|
33
|
+
true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Wbr < Node
|
38
|
+
def open
|
39
|
+
'<br/>'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Wp < Node
|
44
|
+
def open
|
45
|
+
"\n"
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
"\n"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/doc2text/resolution.rb
CHANGED
data/lib/doc2text.rb
CHANGED
@@ -15,4 +15,8 @@ require 'doc2text/docx/docx'
|
|
15
15
|
require 'doc2text/docx/markdown_docx_parser'
|
16
16
|
require 'doc2text/docx/docx_xml_namespaces'
|
17
17
|
|
18
|
+
require 'doc2text/pptx/pptx'
|
19
|
+
require 'doc2text/pptx/markdown_pptx_parser'
|
20
|
+
require 'doc2text/pptx/pptx_xml_namespaces'
|
21
|
+
|
18
22
|
require 'doc2text/styles_parser'
|
metadata
CHANGED
@@ -1,58 +1,53 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Valentin
|
7
|
+
- Valentin A.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.8'
|
20
17
|
- - ">="
|
21
18
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
19
|
+
version: 1.12.5
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.13.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - "~>"
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '1.8'
|
30
27
|
- - ">="
|
31
28
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
29
|
+
version: 1.12.5
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.13.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rubyzip
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
40
|
-
- - ">="
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
version: 1.2.2
|
39
|
+
version: 2.3.0
|
43
40
|
type: :runtime
|
44
41
|
prerelease: false
|
45
42
|
version_requirements: !ruby/object:Gem::Requirement
|
46
43
|
requirements:
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
50
|
-
- - ">="
|
51
|
-
- !ruby/object:Gem::Version
|
52
|
-
version: 1.2.2
|
46
|
+
version: 2.3.0
|
53
47
|
description: Parses odt to markdown
|
54
48
|
email: valentin@nalisbg.com
|
55
|
-
executables: []
|
49
|
+
executables:
|
50
|
+
- doc2text
|
56
51
|
extensions: []
|
57
52
|
extra_rdoc_files: []
|
58
53
|
files:
|
@@ -66,6 +61,9 @@ files:
|
|
66
61
|
- lib/doc2text/odt/markdown_odt_parser.rb
|
67
62
|
- lib/doc2text/odt/odt.rb
|
68
63
|
- lib/doc2text/odt/odt_xml_namespaces.rb
|
64
|
+
- lib/doc2text/pptx/markdown_pptx_parser.rb
|
65
|
+
- lib/doc2text/pptx/pptx.rb
|
66
|
+
- lib/doc2text/pptx/pptx_xml_namespaces.rb
|
69
67
|
- lib/doc2text/resolution.rb
|
70
68
|
- lib/doc2text/styles_parser.rb
|
71
69
|
- lib/doc2text/xml_based_document_file.rb
|
@@ -88,8 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
86
|
- !ruby/object:Gem::Version
|
89
87
|
version: '0'
|
90
88
|
requirements: []
|
91
|
-
|
92
|
-
rubygems_version: 2.7.8
|
89
|
+
rubygems_version: 3.1.2
|
93
90
|
signing_key:
|
94
91
|
specification_version: 4
|
95
92
|
summary: Translates odt to markdown
|