coradoc-docx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +7 -0
  2. data/README.adoc +164 -0
  3. data/lib/coradoc/docx/transform/context.rb +72 -0
  4. data/lib/coradoc/docx/transform/from_core_model.rb +577 -0
  5. data/lib/coradoc/docx/transform/numbering_resolver.rb +127 -0
  6. data/lib/coradoc/docx/transform/ordered_content.rb +95 -0
  7. data/lib/coradoc/docx/transform/rule.rb +57 -0
  8. data/lib/coradoc/docx/transform/rule_registry.rb +60 -0
  9. data/lib/coradoc/docx/transform/rules/bookmark_rule.rb +34 -0
  10. data/lib/coradoc/docx/transform/rules/break_rule.rb +30 -0
  11. data/lib/coradoc/docx/transform/rules/footnote_rule.rb +27 -0
  12. data/lib/coradoc/docx/transform/rules/heading_rule.rb +53 -0
  13. data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb +58 -0
  14. data/lib/coradoc/docx/transform/rules/image_rule.rb +125 -0
  15. data/lib/coradoc/docx/transform/rules/list_item_rule.rb +47 -0
  16. data/lib/coradoc/docx/transform/rules/math_rule.rb +82 -0
  17. data/lib/coradoc/docx/transform/rules/paragraph_rule.rb +65 -0
  18. data/lib/coradoc/docx/transform/rules/proof_error_rule.rb +25 -0
  19. data/lib/coradoc/docx/transform/rules/run_rule.rb +189 -0
  20. data/lib/coradoc/docx/transform/rules/simple_field_rule.rb +87 -0
  21. data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb +36 -0
  22. data/lib/coradoc/docx/transform/rules/table_rule.rb +85 -0
  23. data/lib/coradoc/docx/transform/rules/text_rule.rb +25 -0
  24. data/lib/coradoc/docx/transform/style_resolver.rb +249 -0
  25. data/lib/coradoc/docx/transform/to_core_model.rb +340 -0
  26. data/lib/coradoc/docx/transform.rb +38 -0
  27. data/lib/coradoc/docx/version.rb +7 -0
  28. data/lib/coradoc/docx.rb +99 -0
  29. metadata +155 -0
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms regular paragraphs to CoreModel::Block.
8
+ #
9
+ # Handles paragraph-style detection for block types:
10
+ # quote, source, literal, example, or plain paragraph.
11
+ #
12
+ # This is the default rule for paragraphs. The orchestrator dispatches
13
+ # heading and list item paragraphs directly, so this rule only sees
14
+ # regular paragraphs.
15
+ class ParagraphRule < Rule
16
+ include OrderedContent
17
+
18
# Priority value reported by this rule.
#
# Always returns the Integer 0.
def priority
  0
end
21
+
22
# Tells whether +element+ is a Uniword paragraph.
#
# Returns nil when the Uniword paragraph class is not loaded, otherwise
# true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::Paragraph)

  element.is_a?(Uniword::Wordprocessingml::Paragraph)
end
26
+
27
# Builds a CoreModel::Block from a paragraph.
#
# The block's element_type comes from the style role reported by the
# context's style resolver; its content is the plain text of the
# transformed inline children, which are also attached as +children+.
# The block id is set only when the paragraph has a bookmark id.
def apply(paragraph, context)
  element_type = block_type_for(context.style_resolver.role_from_style(paragraph))
  inline_nodes = transform_paragraph_content(paragraph, context)
  bookmark_id = extract_bookmark_id(paragraph)

  CoreModel::Block.new(
    element_type: element_type,
    content: extract_plain_text(inline_nodes)
  ).tap do |block|
    block.children = inline_nodes
    block.id = bookmark_id if bookmark_id
  end
end
42
+
43
+ private
44
+
45
# Maps a style role symbol to the block element type string.
#
# :quote, :source, :literal and :example map to the string of the same
# name; any other value (including nil) maps to 'paragraph'.
def block_type_for(role)
  known_types = {
    quote: 'quote',
    source: 'source',
    literal: 'literal',
    example: 'example'
  }
  known_types.fetch(role, 'paragraph')
end
54
+
55
# Returns the id of the paragraph's first bookmark start as a String.
#
# Returns nil when the paragraph has no bookmark starts, or when the
# first bookmark start has a nil id.
def extract_bookmark_id(paragraph)
  bookmarks = paragraph.bookmark_starts
  return nil if bookmarks.nil? || bookmarks.empty?

  raw_id = bookmarks.first.id
  raw_id.nil? ? nil : raw_id.to_s
end
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Silently ignores w:proofErr (proofing error) elements.
8
+ #
9
+ # Proofing errors are spelling/grammar markers in OOXML that have
10
+ # no semantic representation in CoreModel. This rule matches them
11
+ # and returns nil, effectively stripping them from the output.
12
+ class ProofErrorRule < Rule
13
# Tells whether +element+ is a Uniword proofing-error marker.
#
# Returns nil when the Uniword ProofError class is not loaded, otherwise
# true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::ProofError)

  element.is_a?(Uniword::Wordprocessingml::ProofError)
end
17
+
18
# Produces no output for a proofing-error element: both arguments are
# ignored and the result is always nil.
def apply(_element, _context)
  # Empty body on purpose; a method with no expressions returns nil.
end
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:r (Run) elements to InlineElement or String.
8
+ #
9
+ # Runs with formatting become CoreModel::InlineElement nodes.
10
+ # Plain runs (no formatting properties) return their text directly.
11
+ #
12
+ # A single run may carry multiple formatting properties (e.g., bold +
13
+ # italic). The first matching property in a fixed precedence order wins for format_type, while the
14
+ # text content is preserved.
15
+ #
16
+ # Uses effective_run_properties (when available) to resolve style
17
+ # inheritance: explicit properties > paragraph style's rPr > basedOn chain.
18
+ # Falls back to run.properties for backward compatibility.
19
+ class RunRule < Rule
20
# Tells whether +element+ is a Uniword run.
#
# Returns nil when the Uniword Run class is not loaded, otherwise
# true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::Run)

  element.is_a?(Uniword::Wordprocessingml::Run)
end
24
+
25
# Converts a run into a String, a CoreModel::InlineElement, or the
# result of transforming one of its non-text children.
#
# - Run without a text node but with non-text children: returns the
#   first collected non-text child.
# - Run with neither text nor non-text children: returns ''.
# - Run without formatting properties, or whose formatting yields no
#   format type: returns the text.
# - Otherwise: returns an InlineElement carrying the format type and text.
#
# NOTE(review): when the run has a text node, any collected non-text
# children are not part of the return value, and when it has none only
# the first non-text child is returned — confirm this is intended.
def apply(run, context)
  extras = extract_non_text_children(run, context)
  return extras.first if extras.any? && run.text.nil?

  content = run.text&.content.to_s
  return '' if content.empty? && extras.empty?

  properties = effective_props(run)
  return content if plain_run?(properties)

  kind = format_type(properties, run, context)
  return content unless kind

  CoreModel::InlineElement.new(format_type: kind, content: content)
end
44
+
45
+ private
46
+
47
# Returns the run's effective_run_properties when that is truthy,
# otherwise falls back to the run's own +properties+.
def effective_props(run)
  run.effective_run_properties || run.properties
end
53
+
54
# Collects everything in a run that is not its plain text node.
#
# Items are appended in a fixed order: break, footnote reference,
# endnote reference, drawings, tab, math, deleted text, symbol,
# non-breaking hyphen, soft hyphen, carriage return, alternate content.
# Child elements go through context.transform; tab and hyphens become
# literal characters; deleted text becomes a 'strikethrough' inline
# element; a carriage return becomes a 'hard_line_break' inline element.
# nil entries are removed before returning.
#
# o_math, soft_hyphen and carriage_return are only read when the run's
# class lists them in its +attributes+.
def extract_non_text_children(run, context)
  declared = run.class.attributes
  found = []

  found << context.transform(run.break) if run.break
  found << context.transform(run.footnote_reference) if run.footnote_reference
  found << context.transform(run.endnote_reference) if run.endnote_reference
  run.drawings&.each { |drawing| found << context.transform(drawing) }
  found << "\t" if run.tab
  found << context.transform(run.o_math) if declared.key?(:o_math) && run.o_math

  deleted = run.del_text
  if deleted
    # del_text may be a DeletedText wrapper or something string-like.
    removed = deleted.is_a?(Uniword::Wordprocessingml::DeletedText) ? deleted.content.to_s : deleted.to_s
    found << CoreModel::InlineElement.new(format_type: 'strikethrough', content: removed) unless removed.empty?
  end

  if run.sym
    glyph = run.sym.char
    found << glyph.to_s if glyph && !glyph.empty?
  end

  found << "\u2011" if run.no_break_hyphen
  found << "\u00AD" if declared.key?(:soft_hyphen) && run.soft_hyphen

  if declared.key?(:carriage_return) && run.carriage_return
    found << CoreModel::InlineElement.new(format_type: 'hard_line_break')
  end

  found << extract_alternate_content(run.alternate_content, context) if run.alternate_content

  found.compact
end
99
+
100
# Transforms the content of an alternate-content element.
#
# The fallback is preferred; the choice is used only when there is no
# fallback. Returns nil when neither is present or when the selected
# content is neither a Run nor a Paragraph instance.
#
# For a Run, each entry of its +runs+ is transformed; for a Paragraph,
# each entry of its +paragraphs+. The transformed results are returned
# as an array (nil when the collection itself is nil).
def extract_alternate_content(ac, context)
  content = ac.fallback || ac.choice
  return nil unless content

  if content.is_a?(Uniword::Wordprocessingml::Run)
    # flat_map, not each: each returns the untransformed receiver and
    # would throw away every context.transform result.
    content.runs&.flat_map { |r| context.transform(r) }
  elsif content.is_a?(Uniword::Wordprocessingml::Paragraph)
    content.paragraphs&.flat_map { |p| context.transform(p) }
  end
end
115
+
116
# Tells whether a run carries none of the formatting properties this
# rule looks at.
#
# Returns true when +props+ is nil/false, or when every inspected
# property reader returns nil. A property that is present but false
# still counts as formatting here.
def plain_run?(props)
  return true unless props

  inspected = %i[
    bold italic underline strike double_strike
    vertical_align small_caps caps hidden highlight
  ]
  inspected.all? { |reader| props.public_send(reader).nil? }
end
130
+
131
# Picks the single format type string used for a formatted run.
#
# Returns nil when +props+ is nil/false. When the context's style
# resolver is a StyleResolver, its semantic role for the run is checked
# first: :monospace, :bold and :italic map to the same-named string.
# Otherwise the explicit properties are tested in a fixed order and the
# first hit wins: bold, italic, underline, strike/double strike,
# subscript, superscript, small caps ('small'), caps ('bold'),
# highlight. Anything else — including a run that is only hidden —
# yields nil.
#
# NOTE(review): bold and italic honour an explicit false value, while
# underline, strike, small_caps, caps and highlight are tested for mere
# presence — confirm whether those can carry a false value too.
def format_type(props, run, context)
  return nil unless props

  resolver = context.style_resolver
  if resolver.is_a?(Coradoc::Docx::Transform::StyleResolver)
    semantic = resolver.run_semantic_role(run)
    styled = { monospace: 'monospace', bold: 'bold', italic: 'italic' }[semantic]
    return styled if styled
  end

  return 'bold' if bold?(props)
  return 'italic' if italic?(props)
  return 'underline' if props.underline
  return 'strikethrough' if props.strike || props.double_strike
  return 'subscript' if subscript?(props)
  return 'superscript' if superscript?(props)
  return 'small' if props.small_caps
  return 'bold' if props.caps
  return 'highlight' if props.highlight

  # Hidden-only (or otherwise unrecognised) formatting has no format type.
  nil
end
169
+
170
# Tells whether bold is switched on: the bold property must be present
# and its value must not be literally false. Returns nil (not false)
# when the property is absent.
def bold?(props)
  flag = props.bold
  flag && flag.value != false
end
173
+
174
# Tells whether italic is switched on: the italic property must be
# present and its value must not be literally false. Returns nil (not
# false) when the property is absent.
def italic?(props)
  flag = props.italic
  flag && flag.value != false
end
177
+
178
# True when the vertical alignment value, converted to a String, is
# exactly 'subscript'. A missing vertical_align yields false.
def subscript?(props)
  alignment = props.vertical_align
  position = alignment.nil? ? nil : alignment.value
  position.to_s == 'subscript'
end
181
+
182
# True when the vertical alignment value, converted to a String, is
# exactly 'superscript'. A missing vertical_align yields false.
def superscript?(props)
  alignment = props.vertical_align
  position = alignment.nil? ? nil : alignment.value
  position.to_s == 'superscript'
end
185
+ end
186
+ end
187
+ end
188
+ end
189
+ end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:fldSimple (simple field) elements.
8
+ #
9
+ # Simple fields include page numbers, dates, document properties,
10
+ # and other computed content. This rule extracts the field's text
11
+ # content when available, otherwise produces the instruction text.
12
+ #
13
+ # Common field types:
14
+ # - PAGE → current page number
15
+ # - NUMPAGES → total page count
16
+ # - DATE → current date
17
+ # - TIME → current time
18
+ # - DOCPROPERTY → document property value
19
+ # - TITLE → document title
20
+ # - AUTHOR → document author
21
+ class SimpleFieldRule < Rule
22
# Tells whether +element+ is a Uniword simple field.
#
# Returns nil when the Uniword SimpleField class is not loaded,
# otherwise true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::SimpleField)

  element.is_a?(Uniword::Wordprocessingml::SimpleField)
end
26
+
27
# Converts a simple field into text, a link element, or nothing.
#
# Returns nil when the field has no text. Otherwise the result depends
# on the start of the field instruction:
# - TITLE / AUTHOR / SUBJECT / KEYWORDS / DOCPROPERTY: the field text
# - PAGE / NUMPAGES: nil
# - HYPERLINK: a 'link' InlineElement when a quoted URL is found in the
#   instruction, otherwise the field text
# - TOC / PAGEREF / REF / NOTEREF: nil
# - anything else: the field text
def apply(field, _context)
  resolved = field_text(field)
  return nil if resolved.nil? || resolved.empty?

  instruction = field_instruction(field)
  if instruction.match?(/\A(TITLE|AUTHOR|SUBJECT|KEYWORDS|DOCPROPERTY)\b/i)
    # Document metadata: the resolved value is emitted as plain text.
    resolved
  elsif instruction.match?(/\A(PAGE|NUMPAGES)\b/i)
    # Page-layout fields are dropped.
    nil
  elsif instruction.match?(/\A(HYPERLINK)\b/i)
    target = extract_hyperlink_url(instruction)
    return resolved unless target

    CoreModel::InlineElement.new(format_type: 'link', content: resolved, target: target)
  elsif instruction.match?(/\A(TOC|PAGEREF|REF|NOTEREF)\b/i)
    # TOC and cross-reference fields are dropped.
    nil
  else
    resolved
  end
end
61
+
62
+ private
63
+
64
# Joins the text content of all runs in the field into one String.
#
# Returns nil when the field has no runs. A run without a text node
# contributes an empty string.
def field_text(field)
  return nil if !field.runs || field.runs.empty?

  pieces = field.runs.map { |r| r.text&.content.to_s }
  pieces.join
end
73
+
74
# Returns the field's instruction as a String with surrounding
# whitespace removed ('' when the field has no instruction).
#
# Stripping matters because field instructions are commonly stored with
# padding (e.g. " PAGE "), and code that classifies the instruction with
# start-anchored patterns would otherwise never recognise the field name.
def field_instruction(field)
  field.instr.to_s.strip
end
78
+
79
# Pulls the first double-quoted argument that directly follows the
# HYPERLINK keyword out of a field instruction.
#
# Returns nil when the instruction has no such quoted argument (for
# example when a switch sits between the keyword and the quotes).
def extract_hyperlink_url(instr)
  instr[/HYPERLINK\s+"([^"]+)"/i, 1]
end
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:sdt (Structured Document Tag) elements.
8
+ #
9
+ # SDTs wrap content with additional metadata. The transform
10
+ # unwraps them and delegates to the content's own rules.
11
+ class StructuredDocumentTagRule < Rule
12
# Tells whether +element+ is a Uniword structured document tag.
#
# Returns nil when the Uniword StructuredDocumentTag class is not
# loaded, otherwise true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::StructuredDocumentTag)

  element.is_a?(Uniword::Wordprocessingml::StructuredDocumentTag)
end
16
+
17
# Unwraps a structured document tag by transforming what it contains.
#
# Returns nil when the tag has no content. Otherwise every paragraph,
# then every table, of the content is passed to context.transform and
# nil results are discarded. A single surviving result is returned
# bare; zero or several are returned as an array.
#
# NOTE(review): paragraphs are emitted before tables, so the original
# interleaving of the two inside the tag is not preserved — confirm
# whether document order matters here.
def apply(sdt, context)
  content = sdt.content
  return nil unless content

  transformed = []
  (content.paragraphs || []).each { |p| transformed << context.transform(p) }
  (content.tables || []).each { |t| transformed << context.transform(t) }

  # Drop nils BEFORE deciding single-vs-array. Enumerable#one? counts
  # truthy entries, so testing it on the raw list would treat [nil, x]
  # as "one result" and then return the nil at index 0.
  kept = transformed.compact
  kept.size == 1 ? kept.first : kept
end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:tbl (Table) elements to CoreModel::Table.
8
+ #
9
+ # Walks the OOXML table structure (Table → TableRow → TableCell)
10
+ # and produces the corresponding CoreModel tree.
11
+ #
12
+ # Cell paragraphs are transformed through the rule system to preserve
13
+ # inline formatting (bold, italic, links) as InlineElement objects.
14
+ # Print-layout properties (frame, grid, width) are NOT mapped — CoreModel
15
+ # is a semantic model, not a print layout language.
16
+ class TableRule < Rule
17
+ include OrderedContent
18
+
19
# Tells whether +element+ is a Uniword table.
#
# Returns nil when the Uniword Table class is not loaded, otherwise
# true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::Table)

  element.is_a?(Uniword::Wordprocessingml::Table)
end
23
+
24
# Builds a CoreModel::Table whose rows are the table's rows, each
# converted with transform_row, in their original order.
def apply(table, context)
  converted_rows = table.rows.map { |row| transform_row(row, context) }
  CoreModel::Table.new(rows: converted_rows)
end
29
+
30
+ private
31
+
32
# Builds a CoreModel::TableRow from a table row: each cell is converted
# with transform_cell and the header flag is taken from row.header?.
def transform_row(row, context)
  converted_cells = row.cells.map { |cell| transform_cell(cell, context) }
  CoreModel::TableRow.new(cells: converted_cells, header: row.header?)
end
38
+
39
# Builds a CoreModel::TableCell from a table cell.
#
# The inline children of all cell paragraphs are concatenated into one
# list, which becomes +children+; +content+ is the plain text of that
# list. Column/row spans come straight from the cell, +alignment+ is
# the String form of the cell properties' vertical_align (nil when
# absent), and +header+ comes from header_cell?.
def transform_cell(cell, context)
  inlines = cell_paragraphs(cell).flat_map do |paragraph|
    extract_inline_from_paragraph(paragraph, context)
  end
  cell_props = cell.properties
  plain = extract_plain_text(inlines)

  CoreModel::TableCell.new(
    content: plain,
    alignment: cell_props&.vertical_align&.to_s,
    colspan: cell.column_span,
    rowspan: cell.row_span,
    header: header_cell?(cell),
    children: inlines
  )
end
55
+
56
# Transforms one cell paragraph and returns its inline pieces as a list.
#
# - transform result is nil/false: []
# - result is not a CoreModel::Block: the result wrapped in an array
# - Block with any truthy child: the block's children
# - Block without such children: its content wrapped in an array, or []
#   when the content is nil
def extract_inline_from_paragraph(para, context)
  node = context.transform(para)
  return [] unless node
  return [node] unless node.is_a?(CoreModel::Block)
  return node.children if node.children.any?

  [node.content].compact
end
70
+
71
# Returns the cell's paragraphs, substituting an empty array when the
# cell reports none.
def cell_paragraphs(cell)
  paragraphs = cell.paragraphs
  paragraphs ? paragraphs : []
end
74
+
75
# Reports a cell as a header when its properties carry a v_merge whose
# value, as a String, is 'restart'. Returns false when the cell has no
# properties or no v_merge.
#
# NOTE(review): v_merge presumably describes vertical cell merging
# rather than header status, so this looks like a heuristic — confirm
# it is the intended header signal.
def header_cell?(cell)
  cell_props = cell.properties
  return false unless cell_props

  merge = cell_props.v_merge
  !merge.nil? && merge.value.to_s == 'restart'
end
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:t (Text) elements to plain strings.
8
+ #
9
+ # Text is returned as a raw string — not wrapped in a CoreModel node.
10
+ # The caller (RunRule) is responsible for wrapping in InlineElement
11
+ # when formatting is present.
12
+ class TextRule < Rule
13
# Tells whether +element+ is a Uniword text node.
#
# Returns nil when the Uniword Text class is not loaded, otherwise
# true/false from the is_a? check.
def matches?(element)
  return unless defined?(Uniword::Wordprocessingml::Text)

  element.is_a?(Uniword::Wordprocessingml::Text)
end
17
+
18
# Returns the text node's content converted to a String ('' when the
# content is nil). The context argument is not used.
def apply(text, _context)
  raw = text.content
  raw.to_s
end
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end