docx-parser 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: dc8f13a3627d0f0616472ac34111777cdd98ed11b54f147f3043b72044121e8d
4
+ data.tar.gz: 0e820cb74de1e3e33ea23073352f8db22e081d6a2eb2ffa8468e14b91e7fdf05
5
+ SHA512:
6
+ metadata.gz: b7ee4968132be5bba2e7424cde9b6cf71ef7f17e095a4ee13e3e7d43d69ee797817b9d7ac183e189e00dd7e124a10a261e70ff49b4aebcb907e1184e6e58041c
7
+ data.tar.gz: 80d8a1291c07b6284ecb49e37a4496576c52331b35448c2db34357a45c1d7eea2c377ce5a928620a1ade604e15abdbfbfe817f24bcd6a18779c1d564e9a51b41
data/LICENSE.md ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) Marcus Ortiz, http://marcusortiz.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,192 @@
1
+ # docx
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/docx.svg)](https://badge.fury.io/rb/docx)
4
+ [![Ruby](https://github.com/ruby-docx/docx/workflows/Ruby/badge.svg)](https://github.com/ruby-docx/docx/actions?query=workflow%3ARuby)
5
+ [![Coverage Status](https://coveralls.io/repos/github/ruby-docx/docx/badge.svg?branch=master)](https://coveralls.io/github/ruby-docx/docx?branch=master)
6
+ [![Gitter](https://badges.gitter.im/ruby-docx/community.svg)](https://gitter.im/ruby-docx/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
7
+
8
+ A Ruby library/gem for interacting with `.docx` files. Current capabilities include reading paragraphs/bookmarks, inserting text at bookmarks, reading tables/rows/columns/cells, and saving the document.
9
+
10
+ ## Usage
11
+
12
+ ### Prerequisites
13
+
14
+ - Ruby 2.5 or later
15
+
16
+ ### Install
17
+
18
+ Add the following line to your application's Gemfile:
19
+
20
+ ```ruby
21
+ gem 'docx'
22
+ ```
23
+
24
+ And then execute:
25
+
26
+ ```shell
27
+ bundle install
28
+ ```
29
+
30
+ Or install it yourself as:
31
+
32
+ ```shell
33
+ gem install docx
34
+ ```
35
+
36
+ ### Reading
37
+
38
+ ``` ruby
39
+ require 'docx'
40
+
41
+ # Create a Docx::Document object for our existing docx file
42
+ doc = Docx::Document.open('example.docx')
43
+
44
+ # Retrieve and display paragraphs
45
+ doc.paragraphs.each do |p|
46
+ puts p
47
+ end
48
+
49
+ # Retrieve and display bookmarks, returned as hash with bookmark names as keys and objects as values
50
+ doc.bookmarks.each_pair do |bookmark_name, bookmark_object|
51
+ puts bookmark_name
52
+ end
53
+ ```
54
+
55
+ Don't have a local file but a buffer? Docx handles those too:
56
+
57
+ ```ruby
58
+ require 'docx'
59
+
60
+ # Create a Docx::Document object from an in-memory buffer (e.g. the contents of a remote file)
61
+ doc = Docx::Document.open(buffer)
62
+
63
+ # Everything about reading is the same as shown above
64
+ ```
65
+
66
+ ### Rendering html
67
+ ``` ruby
68
+ require 'docx'
69
+
70
+ # Retrieve and display paragraphs as html
71
+ doc = Docx::Document.open('example.docx')
72
+ doc.paragraphs.each do |p|
73
+ puts p.to_html
74
+ end
75
+ ```
76
+
77
+ ### Reading tables
78
+
79
+ ``` ruby
80
+ require 'docx'
81
+
82
+ # Create a Docx::Document object for our existing docx file
83
+ doc = Docx::Document.open('tables.docx')
84
+
85
+ first_table = doc.tables[0]
86
+ puts first_table.row_count
87
+ puts first_table.column_count
88
+ puts first_table.rows[0].cells[0].text
89
+ puts first_table.columns[0].cells[0].text
90
+
91
+ # Iterate through tables
92
+ doc.tables.each do |table|
93
+ table.rows.each do |row| # Row-based iteration
94
+ row.cells.each do |cell|
95
+ puts cell.text
96
+ end
97
+ end
98
+
99
+ table.columns.each do |column| # Column-based iteration
100
+ column.cells.each do |cell|
101
+ puts cell.text
102
+ end
103
+ end
104
+ end
105
+ ```
106
+
107
+ ### Writing
108
+
109
+ ``` ruby
110
+ require 'docx'
111
+
112
+ # Create a Docx::Document object for our existing docx file
113
+ doc = Docx::Document.open('example.docx')
114
+
115
+ # Insert a single line of text after one of our bookmarks
116
+ doc.bookmarks['example_bookmark'].insert_text_after("Hello world.")
117
+
118
+ # Insert multiple lines of text at our bookmark
119
+ doc.bookmarks['example_bookmark_2'].insert_multiple_lines(['Hello', 'World', 'foo'])
120
+
121
+ # Remove paragraphs
122
+ doc.paragraphs.each do |p|
123
+ p.remove! if p.to_s =~ /TODO/
124
+ end
125
+
126
+ # Substitute text, preserving formatting
127
+ doc.paragraphs.each do |p|
128
+ p.each_text_run do |tr|
129
+ tr.substitute('_placeholder_', 'replacement value')
130
+ end
131
+ end
132
+
133
+ # Save document to specified path
134
+ doc.save('example-edited.docx')
135
+ ```
136
+
137
+ ### Writing to tables
138
+
139
+ ``` ruby
140
+ require 'docx'
141
+
142
+ # Create a Docx::Document object for our existing docx file
143
+ doc = Docx::Document.open('tables.docx')
144
+
145
+ # Iterate over each table
146
+ doc.tables.each do |table|
147
+ last_row = table.rows.last
148
+
149
+ # Copy last row and insert a new one before last row
150
+ new_row = last_row.copy
151
+ new_row.insert_before(last_row)
152
+
153
+ # Substitute text in each cell of this new row
154
+ new_row.cells.each do |cell|
155
+ cell.paragraphs.each do |paragraph|
156
+ paragraph.each_text_run do |text|
157
+ text.substitute('_placeholder_', 'replacement value')
158
+ end
159
+ end
160
+ end
161
+ end
162
+
163
+ doc.save('tables-edited.docx')
164
+ ```
165
+
166
+ ### Advanced
167
+
168
+ ``` ruby
169
+ require 'docx'
170
+
171
+ d = Docx::Document.open('example.docx')
172
+
173
+ # The Nokogiri::XML::Node on which an element is based can be accessed using #node
174
+ d.paragraphs.each do |p|
175
+ puts p.node.inspect
176
+ end
177
+
178
+ # The #xpath and #at_xpath methods are delegated to the node from the element, saving a step
179
+ p_element = d.paragraphs.first
180
+ p_children = p_element.xpath("//child::*") # selects all children
181
+ p_child = p_element.at_xpath("//child::*") # selects first child
182
+ ```
183
+
184
+ ## Development
185
+
186
+ ### todo
187
+
188
+ * Calculate element formatting based on values present in element properties as well as properties inherited from parents
189
+ * Default formatting of inserted elements to inherited values
190
+ * Implement formattable elements.
191
+ * Implement styles.
192
+ * Easier multi-line text insertion at a single bookmark (inserting paragraph nodes after the one containing the bookmark)
@@ -0,0 +1,24 @@
1
+ require 'docx/elements'
2
+
3
module Docx
  module Elements
    module Containers
      # Behaviour shared by container elements.
      #
      # An including class is expected to set two instance variables:
      # @node::           the wrapped XML node; it must respond to +at_xpath+,
      #                   +xpath+ and +remove+.
      # @properties_tag:: local name of the child element that stores the
      #                   container's properties (e.g. 'pPr'); may be empty
      #                   when the container has none.
      module Container
        # Relation methods
        # TODO: Create a properties object, include Element

        # Returns the properties child of the wrapped node, or nil when the
        # container declares no properties tag or the child does not exist.
        #
        # The tag is looked up in the +w:+ namespace, matching every other
        # XPath in this library; a tag that already carries a prefix is used
        # unchanged. An empty tag short-circuits to nil because "./" on its own
        # is not a usable XPath expression.
        def properties
          tag = @properties_tag.to_s
          return nil if tag.empty?

          tag = "w:#{tag}" unless tag.include?(':')
          @node.at_xpath("./#{tag}")
        end

        # Erase text within an element: every descendant <w:t> is emptied,
        # the elements themselves are kept.
        def blank!
          @node.xpath('.//w:t').each { |t| t.content = '' }
        end

        # Detach the wrapped node from its document.
        def remove!
          @node.remove
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ require 'docx/containers/text_run'
2
+ require 'docx/containers/container'
3
+
4
module Docx
  module Elements
    module Containers
      # Wrapper around a paragraph (<w:p>) node.
      class Paragraph
        include Container
        include Elements::Element

        def self.tag
          'p'
        end

        # Child elements: pPr, r, fldSimple, hlink, subDoc
        # http://msdn.microsoft.com/en-us/library/office/ee364458(v=office.11).aspx
        #
        # node::                the paragraph XML node.
        # document_properties:: hash of document-wide defaults; only
        #                       :font_size is read here, the whole hash is
        #                       handed on to the text runs.
        def initialize(node, document_properties = {})
          @node = node
          @properties_tag = 'pPr'
          @document_properties = document_properties
          @font_size = @document_properties[:font_size]
        end

        # Set text of paragraph.
        #
        # A paragraph with exactly one run keeps that run (and therefore its
        # formatting). Otherwise all existing runs are removed and a single new
        # run holding +content+ is appended.
        def text=(content)
          runs = text_runs
          if runs.size == 1
            runs.first.text = content
          else
            runs.each { |run| run.node.remove }
            TextRun.create_within(self).text = content
          end
        end

        # Return text of paragraph
        def to_s
          text_runs.map(&:text).join('')
        end

        # Return paragraph as a <p></p> HTML fragment with formatting based on properties.
        #
        # The font-size style is only emitted when a size is known; without
        # this guard a paragraph created without document properties would
        # render the meaningless declaration "font-size:pt".
        def to_html
          html = text_runs.map(&:to_html).join('')
          styles = {}
          size = font_size
          styles['font-size'] = "#{size}pt" if size
          align = alignment
          styles['text-align'] = align if align
          html_tag(:p, content: html, styles: styles)
        end

        # Array of text runs contained within paragraph
        # (direct <w:r> and <w:hyperlink> children, in document order).
        def text_runs
          @node.xpath('w:r|w:hyperlink').map { |r_node| Containers::TextRun.new(r_node, @document_properties) }
        end

        # Iterate over each text run within a paragraph
        def each_text_run
          text_runs.each { |tr| yield(tr) }
        end

        # A paragraph with no alignment element counts as left aligned.
        def aligned_left?
          ['left', nil].include?(alignment)
        end

        def aligned_right?
          alignment == 'right'
        end

        def aligned_center?
          alignment == 'center'
        end

        # Font size taken from the first <w:sz> under the paragraph properties,
        # halved with integer division, or the document default when the
        # paragraph has none.
        def font_size
          size_tag = @node.xpath('w:pPr//w:sz').first
          size_tag ? size_tag.attributes['val'].value.to_i / 2 : @font_size
        end

        alias_method :text, :to_s

        private

        # Returns the alignment if any, or nil if left
        def alignment
          alignment_tag = @node.xpath('.//w:jc').first
          alignment_tag ? alignment_tag.attributes['val'].value : nil
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'docx/containers/table_row'
2
+ require 'docx/containers/table_column'
3
+ require 'docx/containers/container'
4
+
5
module Docx
  module Elements
    module Containers
      # Wrapper around a table (<w:tbl>) node.
      class Table
        include Container
        include Elements::Element

        def self.tag
          'tbl'
        end

        # node:: the table XML node.
        def initialize(node)
          @node = node
          @properties_tag = 'tblGrid'
        end

        # Number of <w:tr> children of the table.
        def row_count
          @node.xpath('w:tr').count
        end

        # Number of <w:gridCol> entries declared in the table grid.
        def column_count
          @node.xpath('w:tblGrid/w:gridCol').count
        end

        # Array of rows, one TableRow per <w:tr> child.
        def rows
          @node.xpath('w:tr').map { |row_node| Containers::TableRow.new(row_node) }
        end

        # Array of columns. Column i is built from the cells matched by the
        # positional XPath predicate [i + 1] under each row.
        def columns
          Array.new(column_count) do |index|
            Containers::TableColumn.new(@node.xpath("w:tr//w:tc[#{index + 1}]"))
          end
        end

        # Iterate over each row within a table
        def each_rows
          rows.each { |row| yield(row) }
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'docx/containers/text_run'
2
+ require 'docx/containers/container'
3
+
4
module Docx
  module Elements
    module Containers
      # Wrapper around a table cell (<w:tc>) node.
      class TableCell
        include Container
        include Elements::Element

        def self.tag
          'tc'
        end

        # node:: the cell XML node.
        def initialize(node)
          @node = node
          @properties_tag = 'tcPr'
        end

        # Array of paragraphs contained within cell (direct <w:p> children).
        def paragraphs
          @node.xpath('w:p').map { |paragraph_node| Containers::Paragraph.new(paragraph_node) }
        end

        # Iterate over each paragraph within the cell
        def each_paragraph
          paragraphs.each { |paragraph| yield(paragraph) }
        end

        # Return the text of all paragraphs in the cell, concatenated without
        # a separator.
        def to_s
          paragraphs.map { |paragraph| paragraph.text }.join('')
        end

        alias_method :text, :to_s
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'docx/containers/table_cell'
2
+ require 'docx/containers/container'
3
+
4
module Docx
  module Elements
    module Containers
      # A table column, represented as the list of cells handed to the
      # constructor rather than as a wrapper around a single XML node.
      class TableColumn
        include Container
        include Elements::Element

        # Array of cells contained within column
        attr_reader :cells

        def self.tag
          'w:gridCol'
        end

        # cell_nodes:: collection of cell XML nodes; each one is wrapped in a
        #              TableCell.
        def initialize(cell_nodes)
          # A column has no node or properties element of its own, so both
          # are set to empty placeholders.
          @node = ''
          @properties_tag = ''
          @cells = cell_nodes.map { |cell_node| Containers::TableCell.new(cell_node) }
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'docx/containers/table_cell'
2
+ require 'docx/containers/container'
3
+
4
module Docx
  module Elements
    module Containers
      # Wrapper around a table row (<w:tr>) node.
      class TableRow
        include Container
        include Elements::Element

        def self.tag
          'tr'
        end

        # row_node:: the row XML node. No properties tag is configured for rows.
        def initialize(node)
          @properties_tag = ''
          @node = node
        end

        # Array of cells contained within row (direct <w:tc> children).
        def cells
          @node.xpath('w:tc').map do |cell_node|
            Containers::TableCell.new(cell_node)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
+ require 'docx/containers/container'
2
+
3
module Docx
  module Elements
    module Containers
      # Wrapper around a run of text: either a <w:r> node or a <w:hyperlink>
      # node that wraps runs.
      class TextRun
        include Container
        include Elements::Element

        # Formatting assumed when none can be parsed. Frozen so the shared
        # constant cannot be mutated through a run's #formatting.
        DEFAULT_FORMATTING = {
          italic:    false,
          bold:      false,
          underline: false
        }.freeze

        def self.tag
          'r'
        end

        attr_reader :text
        attr_reader :formatting
        attr_reader :text_nodes

        # node::                the run (or hyperlink) XML node.
        # document_properties:: hash of document-wide values; :font_size and
        #                       :hyperlinks are read from it.
        def initialize(node, document_properties = {})
          @node = node
          # 'w:t' covers plain runs, 'w:r/w:t' covers runs nested in a hyperlink.
          @text_nodes = @node.xpath('w:t|w:r/w:t').map { |t_node| Elements::Text.new(t_node) }

          @properties_tag = 'rPr'
          @text = parse_text || ''
          @formatting = parse_formatting || DEFAULT_FORMATTING
          @document_properties = document_properties
          @font_size = @document_properties[:font_size]
        end

        # Set text of text run.
        #
        # With one text node its content is replaced; with none a new text
        # node is created inside the run. A run holding several text nodes is
        # left untouched. The cached #text is refreshed afterwards so that
        # reading it back reflects the assignment.
        def text=(content)
          if @text_nodes.size == 1
            @text_nodes.first.content = content
          elsif @text_nodes.empty?
            new_t = Elements::Text.create_within(self)
            new_t.content = content
            @text_nodes << new_t
          end
          @text = parse_text
        end

        # Returns text contained within text run
        def parse_text
          @text_nodes.map(&:content).join('')
        end

        # Substitute text in text @text_nodes
        #
        # match::       String or Regexp passed to String#gsub.
        # replacement:: replacement passed to String#gsub.
        def substitute(match, replacement)
          @text_nodes.each do |text_node|
            text_node.content = text_node.content.gsub(match, replacement)
          end
          # Keep the cached text in step with the modified nodes.
          @text = parse_text
          @text_nodes
        end

        # Derive formatting flags from the presence of <w:i>, <w:b> and <w:u>
        # anywhere below the run node.
        def parse_formatting
          {
            italic:    !@node.xpath('.//w:i').empty?,
            bold:      !@node.xpath('.//w:b').empty?,
            underline: !@node.xpath('.//w:u').empty?
          }
        end

        def to_s
          @text
        end

        # Return text as a HTML fragment with formatting based on properties.
        # NOTE(review): the run text is inserted as-is, without HTML escaping.
        def to_html
          html = @text
          html = html_tag(:em, content: html) if italicized?
          html = html_tag(:strong, content: html) if bolded?
          styles = {}
          styles['text-decoration'] = 'underline' if underlined?
          # No need to be granular with font size down to the span level if it doesn't vary.
          styles['font-size'] = "#{font_size}pt" if font_size != @font_size
          html = html_tag(:span, content: html, styles: styles) unless styles.empty?
          html = html_tag(:a, content: html, attributes: {href: href, target: "_blank"}) if hyperlink?
          html
        end

        def italicized?
          @formatting[:italic]
        end

        def bolded?
          @formatting[:bold]
        end

        def underlined?
          @formatting[:underline]
        end

        def hyperlink?
          @node.name == 'hyperlink'
        end

        # Target of the hyperlink, looked up by relationship id. Returns nil
        # when no :hyperlinks map was supplied (the constructor default)
        # instead of raising on a missing map.
        def href
          @document_properties.dig(:hyperlinks, hyperlink_id)
        end

        def hyperlink_id
          @node.attributes['id'].value
        end

        # Font size taken from the first <w:sz> under the run properties,
        # halved with integer division, or the document default when the run
        # has none.
        def font_size
          size_tag = @node.xpath('w:rPr//w:sz').first
          size_tag ? size_tag.attributes['val'].value.to_i / 2 : @font_size
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ require 'docx/containers/container'
2
+ require 'docx/containers/text_run'
3
+ require 'docx/containers/paragraph'
4
+ require 'docx/containers/table'
@@ -0,0 +1,172 @@
1
+ unless Object.const_defined?("ActiveSupport")
2
+ class Module
3
+ # Provides a delegate class method to easily expose contained objects' public methods
4
+ # as your own. Pass one or more methods (specified as symbols or strings)
5
+ # and the name of the target object via the <tt>:to</tt> option (also a symbol
6
+ # or string). At least one method and the <tt>:to</tt> option are required.
7
+ #
8
+ # Delegation is particularly useful with Active Record associations:
9
+ #
10
+ # class Greeter < ActiveRecord::Base
11
+ # def hello
12
+ # 'hello'
13
+ # end
14
+ #
15
+ # def goodbye
16
+ # 'goodbye'
17
+ # end
18
+ # end
19
+ #
20
+ # class Foo < ActiveRecord::Base
21
+ # belongs_to :greeter
22
+ # delegate :hello, to: :greeter
23
+ # end
24
+ #
25
+ # Foo.new.hello # => "hello"
26
+ # Foo.new.goodbye # => NoMethodError: undefined method `goodbye' for #<Foo:0x1af30c>
27
+ #
28
+ # Multiple delegates to the same target are allowed:
29
+ #
30
+ # class Foo < ActiveRecord::Base
31
+ # belongs_to :greeter
32
+ # delegate :hello, :goodbye, to: :greeter
33
+ # end
34
+ #
35
+ # Foo.new.goodbye # => "goodbye"
36
+ #
37
+ # Methods can be delegated to instance variables, class variables, or constants
38
+ # by providing them as a symbols:
39
+ #
40
+ # class Foo
41
+ # CONSTANT_ARRAY = [0,1,2,3]
42
+ # @@class_array = [4,5,6,7]
43
+ #
44
+ # def initialize
45
+ # @instance_array = [8,9,10,11]
46
+ # end
47
+ # delegate :sum, to: :CONSTANT_ARRAY
48
+ # delegate :min, to: :@@class_array
49
+ # delegate :max, to: :@instance_array
50
+ # end
51
+ #
52
+ # Foo.new.sum # => 6
53
+ # Foo.new.min # => 4
54
+ # Foo.new.max # => 11
55
+ #
56
+ # It's also possible to delegate a method to the class by using +:class+:
57
+ #
58
+ # class Foo
59
+ # def self.hello
60
+ # "world"
61
+ # end
62
+ #
63
+ # delegate :hello, to: :class
64
+ # end
65
+ #
66
+ # Foo.new.hello # => "world"
67
+ #
68
+ # Delegates can optionally be prefixed using the <tt>:prefix</tt> option. If the value
69
+ # is <tt>true</tt>, the delegate methods are prefixed with the name of the object being
70
+ # delegated to.
71
+ #
72
+ # Person = Struct.new(:name, :address)
73
+ #
74
+ # class Invoice < Struct.new(:client)
75
+ # delegate :name, :address, to: :client, prefix: true
76
+ # end
77
+ #
78
+ # john_doe = Person.new('John Doe', 'Vimmersvej 13')
79
+ # invoice = Invoice.new(john_doe)
80
+ # invoice.client_name # => "John Doe"
81
+ # invoice.client_address # => "Vimmersvej 13"
82
+ #
83
+ # It is also possible to supply a custom prefix.
84
+ #
85
+ # class Invoice < Struct.new(:client)
86
+ # delegate :name, :address, to: :client, prefix: :customer
87
+ # end
88
+ #
89
+ # invoice = Invoice.new(john_doe)
90
+ # invoice.customer_name # => 'John Doe'
91
+ # invoice.customer_address # => 'Vimmersvej 13'
92
+ #
93
+ # If the delegate object is +nil+ an exception is raised, and that happens
94
+ # no matter whether +nil+ responds to the delegated method. You can get a
95
+ # +nil+ instead with the +:allow_nil+ option.
96
+ #
97
+ # class Foo
98
+ # attr_accessor :bar
99
+ # def initialize(bar = nil)
100
+ # @bar = bar
101
+ # end
102
+ # delegate :zoo, to: :bar
103
+ # end
104
+ #
105
+ # Foo.new.zoo # raises NoMethodError exception (you called nil.zoo)
106
+ #
107
+ # class Foo
108
+ # attr_accessor :bar
109
+ # def initialize(bar = nil)
110
+ # @bar = bar
111
+ # end
112
+ # delegate :zoo, to: :bar, allow_nil: true
113
+ # end
114
+ #
115
+ # Foo.new.zoo # returns nil
116
# Implementation notes:
# * The last argument must be an options Hash with a :to key, otherwise
#   ArgumentError is raised. :prefix and :allow_nil are optional.
# * The generated methods are compiled with module_eval and attributed to the
#   file and line of the +delegate+ call. That location is taken from
#   caller_locations, which reports path and line number as separate fields,
#   so paths that themselves contain a colon are handled correctly.
def delegate(*methods)
  options = methods.pop
  unless options.is_a?(Hash) && (to = options[:to])
    raise ArgumentError, 'Delegation needs a target. Supply an options hash with a :to key as the last argument (e.g. delegate :hello, to: :greeter).'
  end

  prefix, allow_nil = options.values_at(:prefix, :allow_nil)

  # An automatic prefix only makes sense for a method-like target name, not
  # for a constant, instance variable or class variable.
  if prefix == true && to =~ /^[^a-z_]/
    raise ArgumentError, 'Can only automatically set the delegation prefix when delegating to a method.'
  end

  method_prefix = prefix ? "#{prefix == true ? to : prefix}_" : ''

  location = caller_locations(1, 1).first
  file = location.path
  line = location.lineno

  to = to.to_s
  to = 'self.class' if to == 'class'

  methods.each do |method|
    # Attribute writer methods only accept one argument. Makes sure []=
    # methods still accept two arguments.
    definition = (method =~ /[^\]]=$/) ? 'arg' : '*args, &block'

    if allow_nil
      module_eval(<<-EOS, file, line - 2)
        def #{method_prefix}#{method}(#{definition})        # def customer_name(*args, &block)
          if #{to} || #{to}.respond_to?(:#{method})         #   if client || client.respond_to?(:name)
            #{to}.#{method}(#{definition})                  #     client.name(*args, &block)
          end                                               #   end
        end                                                 # end
      EOS
    else
      exception = %(raise "#{self}##{method_prefix}#{method} delegated to #{to}.#{method}, but #{to} is nil: \#{self.inspect}")

      module_eval(<<-EOS, file, line - 1)
        def #{method_prefix}#{method}(#{definition})        # def customer_name(*args, &block)
          #{to}.#{method}(#{definition})                    #   client.name(*args, &block)
        rescue NoMethodError                                # rescue NoMethodError
          if #{to}.nil?                                     #   if client.nil?
            #{exception}                                    #     # add helpful message to the exception
          else                                              #   else
            raise                                           #     raise
          end                                               #   end
        end                                                 # end
      EOS
    end
  end
end
171
+ end
172
+ end
@@ -0,0 +1,208 @@
1
+ require 'docx/containers'
2
+ require 'docx/elements'
3
+ require 'nokogiri'
4
+ require 'zip'
5
+
6
module Docx
  # The Document class wraps around a docx file and provides methods to
  # interface with it.
  #
  #   # get a Docx::Document for a docx file in the local directory
  #   doc = Docx::Document.open("test.docx")
  #
  #   # get the text from the document
  #   puts doc.text
  #
  #   # do the same thing in a block
  #   Docx::Document.open("test.docx") do |d|
  #     puts d.text
  #   end
  class Document
    # Archive entry rewritten on save when the name of the entry that was
    # actually read is not known.
    DEFAULT_DOCUMENT_ENTRY = 'word/document.xml'

    attr_reader :xml, :doc, :zip, :styles

    # path_or_io:: a file path (String without a null byte) or a buffer / IO
    #              holding the archive.
    # options::    currently unused.
    #
    # Raises Errno::ENOENT when the archive has no word/document*.xml entry.
    # The archive handle is closed before the constructor returns.
    def initialize(path_or_io, options = {})
      @replace = {}

      @zip = if file_path?(path_or_io)
               Zip::File.open(path_or_io)
             else
               Zip::File.open_buffer(path_or_io)
             end

      document = @zip.glob('word/document*.xml').first
      raise Errno::ENOENT if document.nil?

      # Remember which entry was read so #save / #stream replace that same entry.
      @document_entry = document.name
      @document_xml = document.get_input_stream.read
      @doc = Nokogiri::XML(@document_xml)
      load_styles
      load_rels
      yield(self) if block_given?
    ensure
      # @zip is still nil when opening the archive itself failed; closing
      # unconditionally would mask the original error with a NoMethodError.
      @zip&.close
    end

    # This stores the current global document properties, for now
    def document_properties
      {
        font_size: font_size,
        hyperlinks: hyperlinks
      }
    end

    # With no associated block, Docx::Document.open is a synonym for Docx::Document.new. If the optional code block is given, it will be passed the opened +docx+ file as an argument.
    # call-seq:
    #   open(filepath) => file
    #   open(filepath) {|file| block } => file
    def self.open(path, &block)
      new(path, &block)
    end

    def paragraphs
      @doc.xpath('//w:document//w:body/w:p').map { |p_node| parse_paragraph_from p_node }
    end

    # Hash of bookmarks keyed by bookmark name.
    def bookmarks
      bkmrks_hsh = {}
      bkmrks_ary = @doc.xpath('//w:bookmarkStart').map { |b_node| parse_bookmark_from b_node }
      # auto-generated by office 2010
      bkmrks_ary.reject! { |b| b.name == '_GoBack' }
      bkmrks_ary.each { |b| bkmrks_hsh[b.name] = b }
      bkmrks_hsh
    end

    def tables
      @doc.xpath('//w:document//w:body//w:tbl').map { |t_node| parse_table_from t_node }
    end

    # Some documents have this set, others don't.
    # Values are returned as half-points, so to get points, that's why it's divided by 2.
    def font_size
      return nil unless @styles

      size_tag = @styles.xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz').first
      size_tag ? size_tag.attributes['val'].value.to_i / 2 : nil
    end

    # Hyperlink targets are extracted from the document.xml.rels file
    # Returns a hash of relationship id => target; empty when the archive had
    # no relationships part.
    def hyperlinks
      hyperlink_relationships.each_with_object({}) do |rel, hash|
        hash[rel.attributes['Id'].value] = rel.attributes['Target'].value
      end
    end

    def hyperlink_relationships
      # @rels stays nil when the relationships part could not be loaded.
      return [] unless @rels

      @rels.xpath("//xmlns:Relationship[contains(@Type,'hyperlink')]")
    end

    ##
    # *Deprecated*
    #
    # Iterates over paragraphs within document
    # call-seq:
    #   each_paragraph => Enumerator
    def each_paragraph
      paragraphs.each { |p| yield(p) }
    end

    # call-seq:
    #   to_s -> string
    def to_s
      paragraphs.map(&:to_s).join("\n")
    end

    # Output entire document as a String HTML fragment
    def to_html
      paragraphs.map(&:to_html).join("\n")
    end

    # Save document to provided path
    # call-seq:
    #   save(filepath) => void
    def save(path)
      update
      Zip::OutputStream.open(path) { |out| write_entries(out) }
      zip.close
    end

    # Output entire document as a StringIO object
    def stream
      update
      buffer = Zip::OutputStream.write_buffer { |out| write_entries(out) }
      buffer.rewind
      buffer
    end

    alias text to_s

    # Register replacement contents for an archive entry; applied on the next
    # #save or #stream.
    def replace_entry(entry_path, file_contents)
      @replace[entry_path] = file_contents
    end

    private

    # True when the argument should be opened as a file path rather than as an
    # in-memory buffer: a String that contains no null byte. String#include?
    # is used instead of a regexp match because a regexp match raises on
    # strings whose bytes are invalid for their encoding, which is what raw
    # archive data tagged as UTF-8 looks like.
    def file_path?(path_or_io)
      path_or_io.instance_of?(String) && !path_or_io.include?("\u0000")
    end

    # Copy every file entry of the source archive to +out+, substituting the
    # contents registered through #replace_entry.
    def write_entries(out)
      zip.each do |entry|
        next unless entry.file?

        out.put_next_entry(entry.name)
        out.write(@replace[entry.name] || zip.read(entry.name))
      end
    end

    # Load word/styles.xml; a missing entry only produces a warning.
    def load_styles
      @styles_xml = @zip.read('word/styles.xml')
      @styles = Nokogiri::XML(@styles_xml)
    rescue Errno::ENOENT => e
      warn e.message
      nil
    end

    # Load the document relationships part; a missing entry only produces a
    # warning. Loaded independently of the styles so that a document without
    # styles.xml still gets its hyperlinks.
    def load_rels
      rels_entry = @zip.glob('word/_rels/document*.xml.rels').first
      raise Errno::ENOENT unless rels_entry

      @rels_xml = rels_entry.get_input_stream.read
      @rels = Nokogiri::XML(@rels_xml)
    rescue Errno::ENOENT => e
      warn e.message
      nil
    end

    #--
    # TODO: Flesh this out to be compatible with other files
    # TODO: Method to set flag on files that have been edited, probably by inserting something at the
    # end of methods that make edits?
    #++
    # Registers the serialized main document as the replacement for the entry
    # it was read from.
    def update
      replace_entry(@document_entry || DEFAULT_DOCUMENT_ENTRY, doc.serialize(save_with: 0))
    end

    # generate Elements::Containers::Paragraph from paragraph XML node
    def parse_paragraph_from(p_node)
      Elements::Containers::Paragraph.new(p_node, document_properties)
    end

    # generate Elements::Bookmark from bookmark XML node
    def parse_bookmark_from(b_node)
      Elements::Bookmark.new(b_node)
    end

    def parse_table_from(t_node)
      Elements::Containers::Table.new(t_node)
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'docx/elements/element'
2
+
3
module Docx
  module Elements
    # Wrapper around a <w:bookmarkStart> node; text is inserted relative to
    # that node.
    class Bookmark
      include Element
      attr_accessor :name

      def self.tag
        'bookmarkStart'
      end

      # node:: the bookmarkStart XML node; its w:name attribute becomes #name.
      def initialize(node)
        @node = node
        @name = @node['w:name']
      end

      # Insert text before bookmarkStart node
      # (appended to the run that precedes the bookmark).
      def insert_text_before(text)
        run = get_run_before
        run.text = "#{run.text}#{text}"
      end

      # Insert text after bookmarkStart node
      # (prepended to the run that follows the bookmark).
      def insert_text_after(text)
        run = get_run_after
        run.text = "#{text}#{run.text}"
      end

      # insert multiple lines starting with paragraph containing bookmark node.
      #
      # The bookmark's paragraph is blanked and receives the first string;
      # every further string gets a copy of the preceding paragraph, inserted
      # directly after it.
      def insert_multiple_lines(text_array)
        first_paragraph = parent_paragraph
        # Remove text from paragraph
        first_paragraph.blank!

        # paragraphs[i] receives text_array[i]
        paragraphs = [first_paragraph]
        (text_array.size - 1).times do
          previous = paragraphs.last
          # Copy previous paragraph and insert the copy as its next sibling
          duplicate = previous.copy
          duplicate.insert_after(previous)
          paragraphs << duplicate
        end

        # Insert text into corresponding newly created paragraphs
        paragraphs.each_with_index do |paragraph, index|
          paragraph.text = text_array[index]
        end
      end

      # Get text run immediately prior to bookmark node
      def get_run_before
        # preceding-sibling returns siblings in the order they appear in the
        # document, not in the order met when moving out from the starting
        # node, so the nearest run is the last match rather than the first.
        preceding_runs = @node.xpath("./preceding-sibling::w:r")
        return Containers::TextRun.new(preceding_runs.last) unless preceding_runs.empty?

        created = Containers::TextRun.create_with(self)
        created.insert_before(self)
        created
      end

      # Get text run immediately after bookmark node
      def get_run_after
        following_run = @node.at_xpath("./following-sibling::w:r")
        return Containers::TextRun.new(following_run) if following_run

        created = Containers::TextRun.create_with(self)
        created.insert_after(self)
        created
      end
    end
  end
end
@@ -0,0 +1,105 @@
1
+ require 'nokogiri'
2
+ require 'docx/elements'
3
+ require 'docx/containers'
4
+
5
module Docx
  module Elements
    # Behaviour shared by every element wrapper: access to the underlying XML
    # node, insertion helpers, copying and HTML rendering helpers.
    module Element
      DEFAULT_TAG = ''

      # Ensure that a 'tag' corresponding to the XML element that defines the element is defined
      def self.included(base)
        base.extend(ClassMethods)
        base.const_set(:TAG, Element::DEFAULT_TAG) unless base.const_defined?(:TAG)
      end

      attr_accessor :node
      delegate :at_xpath, :xpath, :to => :@node

      # TODO: Should create a docx object from this
      # Returns the parent XML node, optionally restricted to a node test
      # such as 'w:p'.
      def parent(type = '*')
        @node.at_xpath("./parent::#{type}")
      end

      # Get parent paragraph of element
      def parent_paragraph
        Elements::Containers::Paragraph.new(parent('w:p'))
      end

      # Insertion methods
      # Insert node as last child
      def append_to(element)
        @node = element.node.add_child(@node)
        self
      end

      # Insert node as first child (after properties)
      #
      # The properties node is obtained from the element wrapper, not from the
      # raw XML node, which has no such accessor. When the element exposes no
      # properties node, this node simply becomes the first child.
      def prepend_to(element)
        properties_node = element.respond_to?(:properties) ? element.properties : nil
        @node = if properties_node
                  properties_node.add_next_sibling(@node)
                else
                  element.node.prepend_child(@node)
                end
        self
      end

      def insert_after(element)
        # Returns newly re-parented node
        @node = element.node.add_next_sibling(@node)
        self
      end

      def insert_before(element)
        @node = element.node.add_previous_sibling(@node)
        self
      end

      # Creation/edit methods
      # Returns a new wrapper of the same class around a duplicate of the node.
      def copy
        self.class.new(@node.dup)
      end

      # A method to wrap content in an HTML tag.
      # Currently used in paragraph and text_run for the to_html methods
      #
      # content::    The base text content for the tag.
      # styles::     Hash of the inline CSS styles to be applied. e.g.
      #              { 'font-size' => '12pt', 'text-decoration' => 'underline' }
      # attributes:: Hash of further HTML attributes, written as name="value".
      #
      # NOTE(review): neither content nor attribute values are escaped here.
      def html_tag(name, options = {})
        content = options[:content]
        styles = options[:styles]
        attributes = options[:attributes]

        html = "<#{name}"

        unless styles.nil? || styles.empty?
          css = styles.map { |property, value| "#{property}:#{value};" }.join('')
          html << " style=\"#{css}\""
        end

        unless attributes.nil? || attributes.empty?
          attributes.each do |attr_name, attr_value|
            html << " #{attr_name}=\"#{attr_value}\""
          end
        end

        html << ">"
        html << content if content
        html << "</#{name}>"
      end

      module ClassMethods
        # Build a new, unattached element of this class named "w:<tag>".
        def create_with(element)
          # Need to somehow get the xml document accessible here by default, but this is alright in the interim
          self.new(Nokogiri::XML::Node.new("w:#{self.tag}", element.node))
        end

        # Build a new element of this class and append it to +element+.
        def create_within(element)
          new_element = create_with(element)
          new_element.append_to(element)
          new_element
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Docx
  module Elements
    # Element wrapper for nodes whose tag name is 't'.
    class Text
      include Element

      # +content+ and +content=+ are delegated to the wrapped @node.
      delegate :content, :content=, to: :@node

      class << self
        # Tag name for this element type.
        def tag
          't'
        end
      end

      # node:: The node this object wraps; stored in @node.
      def initialize(node)
        @node = node
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'docx/elements/bookmark'
2
+ require 'docx/elements/element'
3
+ require 'docx/elements/text'
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Docx #:nodoc:
  # Version string of this library.
  VERSION = '0.0.0'
end
data/lib/docx.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'docx/version'
2
+
3
module Docx #:nodoc:
  # Register 'docx/document' to be loaded the first time Docx::Document
  # is referenced, instead of requiring it eagerly.
  autoload(:Document, 'docx/document')
end
6
+
7
+ require 'docx/core_ext/module'
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docx-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Christopher Hunt
8
+ - Marcus Ortiz
9
+ - Higgins Dragon
10
+ - Toms Mikoss
11
+ - Sebastian Wittenkamp
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+ date: 2021-08-04 00:00:00.000000000 Z
16
+ dependencies:
17
+ - !ruby/object:Gem::Dependency
18
+ name: nokogiri
19
+ requirement: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - "~>"
22
+ - !ruby/object:Gem::Version
23
+ version: '1.10'
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.10.4
27
+ type: :runtime
28
+ prerelease: false
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 1.10.4
37
+ - !ruby/object:Gem::Dependency
38
+ name: rubyzip
39
+ requirement: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '2.0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - "~>"
49
+ - !ruby/object:Gem::Version
50
+ version: '2.0'
51
+ - !ruby/object:Gem::Dependency
52
+ name: coveralls_reborn
53
+ requirement: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '0.21'
58
+ type: :development
59
+ prerelease: false
60
+ version_requirements: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - "~>"
63
+ - !ruby/object:Gem::Version
64
+ version: '0.21'
65
+ - !ruby/object:Gem::Dependency
66
+ name: rake
67
+ requirement: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '13.0'
72
+ type: :development
73
+ prerelease: false
74
+ version_requirements: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - "~>"
77
+ - !ruby/object:Gem::Version
78
+ version: '13.0'
79
+ - !ruby/object:Gem::Dependency
80
+ name: rspec
81
+ requirement: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - "~>"
84
+ - !ruby/object:Gem::Version
85
+ version: '3.7'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - "~>"
91
+ - !ruby/object:Gem::Version
92
+ version: '3.7'
93
+ description: thin wrapper around rubyzip and nokogiri as a way to get started with
94
+ docx files
95
+ email:
96
+ - jd.martinez1062@gmail.com
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - LICENSE.md
102
+ - README.md
103
+ - lib/docx.rb
104
+ - lib/docx/containers.rb
105
+ - lib/docx/containers/container.rb
106
+ - lib/docx/containers/paragraph.rb
107
+ - lib/docx/containers/table.rb
108
+ - lib/docx/containers/table_cell.rb
109
+ - lib/docx/containers/table_column.rb
110
+ - lib/docx/containers/table_row.rb
111
+ - lib/docx/containers/text_run.rb
112
+ - lib/docx/core_ext/module.rb
113
+ - lib/docx/document.rb
114
+ - lib/docx/elements.rb
115
+ - lib/docx/elements/bookmark.rb
116
+ - lib/docx/elements/element.rb
117
+ - lib/docx/elements/text.rb
118
+ - lib/docx/version.rb
119
+ homepage: https://github.com/jdmartinez1062/docx
120
+ licenses:
121
+ - MIT
122
+ metadata: {}
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: 2.5.0
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubygems_version: 3.2.22
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: a ruby library/gem for interacting with .docx files with a light modification
142
+ from the docx gem in order to allow easier use of the parsed data. Forked from docx
143
+ gem
144
+ test_files: []