doc2text 0.4.0 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/doc2text/odt/markdown_odt_parser.rb +1 -1
- data/lib/doc2text/pptx/markdown_pptx_parser.rb +55 -0
- data/lib/doc2text/pptx/pptx.rb +30 -0
- data/lib/doc2text/pptx/pptx_xml_namespaces.rb +55 -0
- data/lib/doc2text/resolution.rb +2 -0
- data/lib/doc2text.rb +4 -0
- metadata +19 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dcded1a71b2126a042cde415956262082bf8bb256102aaf8fcfe731ed125161b
|
4
|
+
data.tar.gz: af197aac8bd0bb9b40a9b9f78358e54c788f3a582acb8cb77c0e227200cfe5f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c0aed71728b8be859273e6d8b502f9007e80ab6de17666d78095f9877d4035241656c127ef705e3c7878b33cf6d26fdd00f615f8c95e4bd2998f816f584c162
|
7
|
+
data.tar.gz: 5ad185c29658a5e98537221415f77dd5efc1f9b43d28144b57be9cedd998bd5c7cf808438ae4b23f9d712f0e2bb4af6d1546897e1d5951ed9ab4e54ff4263630
|
data/lib/doc2text/odt/markdown_odt_parser.rb
CHANGED
@@ -21,7 +21,7 @@ module Doc2Text
|
|
21
21
|
|
22
22
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
23
23
|
if @current_node.parent and @current_node.parent.office_text?
|
24
|
-
@output
|
24
|
+
@output.write @current_node.expand
|
25
25
|
@current_node.delete
|
26
26
|
end
|
27
27
|
@current_node = @current_node.parent
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Pptx
|
3
|
+
module XmlNodes
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def body?
|
17
|
+
false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
22
|
+
def body?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Generic < Node
|
28
|
+
end
|
29
|
+
|
30
|
+
module W
|
31
|
+
class Wbody < Node
|
32
|
+
def body?
|
33
|
+
true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Wbr < Node
|
38
|
+
def open
|
39
|
+
'<br/>'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Wp < Node
|
44
|
+
def open
|
45
|
+
"\n"
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
"\n"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Pptx
|
3
|
+
class Document < XmlBasedDocument::DocumentFile
|
4
|
+
|
5
|
+
def self.parse_and_save(input, output_filename)
|
6
|
+
docx = new input
|
7
|
+
begin
|
8
|
+
docx.unpack
|
9
|
+
output = File.open output_filename, 'w'
|
10
|
+
markdown = Markdown::DocxParser.new output, nil
|
11
|
+
begin
|
12
|
+
docx.parse markdown
|
13
|
+
ensure
|
14
|
+
markdown.close
|
15
|
+
end
|
16
|
+
ensure
|
17
|
+
docx.clean
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def contains_extracted_files?
|
22
|
+
File.exist? File.join(extract_path, '[Content_Types].xml')
|
23
|
+
end
|
24
|
+
|
25
|
+
def extract_extension
|
26
|
+
'unpacked_pptx'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Pptx
|
3
|
+
module XmlNodes
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def body?
|
17
|
+
false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
22
|
+
def body?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Generic < Node
|
28
|
+
end
|
29
|
+
|
30
|
+
module W
|
31
|
+
class Wbody < Node
|
32
|
+
def body?
|
33
|
+
true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Wbr < Node
|
38
|
+
def open
|
39
|
+
'<br/>'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Wp < Node
|
44
|
+
def open
|
45
|
+
"\n"
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
"\n"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/doc2text/resolution.rb
CHANGED
data/lib/doc2text.rb
CHANGED
@@ -15,4 +15,8 @@ require 'doc2text/docx/docx'
|
|
15
15
|
require 'doc2text/docx/markdown_docx_parser'
|
16
16
|
require 'doc2text/docx/docx_xml_namespaces'
|
17
17
|
|
18
|
+
require 'doc2text/pptx/pptx'
|
19
|
+
require 'doc2text/pptx/markdown_pptx_parser'
|
20
|
+
require 'doc2text/pptx/pptx_xml_namespaces'
|
21
|
+
|
18
22
|
require 'doc2text/styles_parser'
|
metadata
CHANGED
@@ -1,58 +1,53 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Valentin
|
7
|
+
- Valentin A.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.8'
|
20
17
|
- - ">="
|
21
18
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
19
|
+
version: 1.12.5
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.13.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - "~>"
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '1.8'
|
30
27
|
- - ">="
|
31
28
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
29
|
+
version: 1.12.5
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.13.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rubyzip
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
40
|
-
- - ">="
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
version: 1.2.2
|
39
|
+
version: 2.3.0
|
43
40
|
type: :runtime
|
44
41
|
prerelease: false
|
45
42
|
version_requirements: !ruby/object:Gem::Requirement
|
46
43
|
requirements:
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
50
|
-
- - ">="
|
51
|
-
- !ruby/object:Gem::Version
|
52
|
-
version: 1.2.2
|
46
|
+
version: 2.3.0
|
53
47
|
description: Parses odt to markdown
|
54
48
|
email: valentin@nalisbg.com
|
55
|
-
executables:
|
49
|
+
executables:
|
50
|
+
- doc2text
|
56
51
|
extensions: []
|
57
52
|
extra_rdoc_files: []
|
58
53
|
files:
|
@@ -66,6 +61,9 @@ files:
|
|
66
61
|
- lib/doc2text/odt/markdown_odt_parser.rb
|
67
62
|
- lib/doc2text/odt/odt.rb
|
68
63
|
- lib/doc2text/odt/odt_xml_namespaces.rb
|
64
|
+
- lib/doc2text/pptx/markdown_pptx_parser.rb
|
65
|
+
- lib/doc2text/pptx/pptx.rb
|
66
|
+
- lib/doc2text/pptx/pptx_xml_namespaces.rb
|
69
67
|
- lib/doc2text/resolution.rb
|
70
68
|
- lib/doc2text/styles_parser.rb
|
71
69
|
- lib/doc2text/xml_based_document_file.rb
|
@@ -88,8 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
86
|
- !ruby/object:Gem::Version
|
89
87
|
version: '0'
|
90
88
|
requirements: []
|
91
|
-
|
92
|
-
rubygems_version: 2.7.8
|
89
|
+
rubygems_version: 3.1.2
|
93
90
|
signing_key:
|
94
91
|
specification_version: 4
|
95
92
|
summary: Translates odt to markdown
|