coradoc-docx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +7 -0
  2. data/README.adoc +164 -0
  3. data/lib/coradoc/docx/transform/context.rb +72 -0
  4. data/lib/coradoc/docx/transform/from_core_model.rb +577 -0
  5. data/lib/coradoc/docx/transform/numbering_resolver.rb +127 -0
  6. data/lib/coradoc/docx/transform/ordered_content.rb +95 -0
  7. data/lib/coradoc/docx/transform/rule.rb +57 -0
  8. data/lib/coradoc/docx/transform/rule_registry.rb +60 -0
  9. data/lib/coradoc/docx/transform/rules/bookmark_rule.rb +34 -0
  10. data/lib/coradoc/docx/transform/rules/break_rule.rb +30 -0
  11. data/lib/coradoc/docx/transform/rules/footnote_rule.rb +27 -0
  12. data/lib/coradoc/docx/transform/rules/heading_rule.rb +53 -0
  13. data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb +58 -0
  14. data/lib/coradoc/docx/transform/rules/image_rule.rb +125 -0
  15. data/lib/coradoc/docx/transform/rules/list_item_rule.rb +47 -0
  16. data/lib/coradoc/docx/transform/rules/math_rule.rb +82 -0
  17. data/lib/coradoc/docx/transform/rules/paragraph_rule.rb +65 -0
  18. data/lib/coradoc/docx/transform/rules/proof_error_rule.rb +25 -0
  19. data/lib/coradoc/docx/transform/rules/run_rule.rb +189 -0
  20. data/lib/coradoc/docx/transform/rules/simple_field_rule.rb +87 -0
  21. data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb +36 -0
  22. data/lib/coradoc/docx/transform/rules/table_rule.rb +85 -0
  23. data/lib/coradoc/docx/transform/rules/text_rule.rb +25 -0
  24. data/lib/coradoc/docx/transform/style_resolver.rb +249 -0
  25. data/lib/coradoc/docx/transform/to_core_model.rb +340 -0
  26. data/lib/coradoc/docx/transform.rb +38 -0
  27. data/lib/coradoc/docx/version.rb +7 -0
  28. data/lib/coradoc/docx.rb +99 -0
  29. metadata +155 -0
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ # Utility for iterating paragraph content in document order.
7
+ #
8
+ # OOXML paragraphs have separate arrays for runs, hyperlinks, SDTs, etc.
9
+ # but `element_order` (from lutaml-model mixed_content) preserves the
10
+ # interleaved sequence. This module provides a single method to walk
11
+ # paragraph content in the correct order.
12
+ #
13
+ # Used by ParagraphRule, ListItemRule, and HeadingRule.
14
module OrderedContent
  # Maps the child-element names that appear in `element_order` to the
  # paragraph accessor holding the elements of that kind. Names that are
  # not listed here (for example 'proofErr') produce no output.
  ORDERED_COLLECTIONS = {
    'r' => :runs,
    'hyperlink' => :hyperlinks,
    'sdt' => :structured_document_tags,
    'oMathPara' => :o_math_paras,
    'oMath' => :o_maths,
    'fldSimple' => :simple_fields
  }.freeze

  # Iterate paragraph inline content in document order.
  #
  # When the paragraph is a Uniword paragraph with a non-empty
  # `element_order`, children are visited in that order. Otherwise the
  # fallback visits all runs first and then all hyperlinks.
  #
  # @param paragraph [Uniword::Wordprocessingml::Paragraph]
  # @param context [Context] transform context; must respond to #transform
  # @return [Array] transformed content with nil/false results removed
  def transform_paragraph_content(paragraph, context)
    order = paragraph.element_order if paragraph.is_a?(Uniword::Wordprocessingml::Paragraph)
    return transform_sequential(paragraph, context) if order.nil? || order.empty?

    transform_ordered(paragraph, order, context)
  end

  # Flatten children array to plain text string.
  #
  # Strings are used verbatim, InlineElement/Block nodes contribute their
  # `content`, anything else contributes its `to_s`.
  #
  # @param children [Array, nil] mixed content (Strings, InlineElements, Blocks)
  # @return [String] concatenated text; empty string for nil or empty input
  def extract_plain_text(children)
    (children || []).map do |child|
      case child
      when String then child
      when CoreModel::InlineElement, CoreModel::Block then child.content.to_s
      else child.to_s
      end
    end.join
  end

  private

  # Walk `order`, picking the n-th element of the matching collection for
  # the n-th occurrence of each element name.
  #
  # Every collection is read nil-safely, so a paragraph whose `runs` or
  # `hyperlinks` accessor returns nil is skipped instead of raising.
  def transform_ordered(paragraph, order, context)
    seen = Hash.new(0)

    order.each_with_object([]) do |entry, result|
      # Entries are either plain names or objects exposing #name.
      name = entry.is_a?(String) ? entry : entry.name
      idx = seen[name]
      seen[name] = idx + 1

      accessor = ORDERED_COLLECTIONS[name]
      next unless accessor

      element = paragraph.public_send(accessor)&.[](idx)
      next unless element

      item = context.transform(element)
      result << item if item
    end
  end

  # Fallback used when no element order is available: all runs, then all
  # hyperlinks. Interleaving between the two is not preserved.
  def transform_sequential(paragraph, context)
    sources = (paragraph.runs || []).to_a + (paragraph.hyperlinks || []).to_a
    sources.map { |element| context.transform(element) }.compact
  end
end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ # Base class for OOXML → CoreModel transform rules.
7
+ #
8
+ # Each rule handles one OOXML element type and produces a CoreModel
9
+ # node. Rules are registered in RuleRegistry and dispatched by the
10
+ # ToCoreModel orchestrator.
11
+ #
12
+ # Subclasses must implement:
13
+ # - matches?(element) → true if this rule handles the element
14
+ # - apply(element, context) → CoreModel node or Array of nodes
15
+ #
16
+ # @example Implementing a custom rule
17
+ # class MyRule < Rule
18
+ # def matches?(element)
19
+ # element.is_a?(Uniword::Wordprocessingml::MyElement)
20
+ # end
21
+ #
22
+ # def apply(element, context)
23
+ # Coradoc::CoreModel::Block.new(
24
+ # element_type: 'paragraph',
25
+ # content: element.text
26
+ # )
27
+ # end
28
+ # end
29
class Rule
  # Tell whether this rule is responsible for the given element.
  # The base implementation is abstract and always raises.
  #
  # @param element [Object] OOXML element to check
  # @return [Boolean]
  # @raise [NotImplementedError] unless overridden by the subclass
  def matches?(element)
    message = "#{self.class}#matches? not implemented"
    raise NotImplementedError, message
  end

  # Convert an OOXML element into its CoreModel representation.
  # The base implementation is abstract and always raises.
  #
  # @param element [Object] OOXML element to transform
  # @param context [Context] shared transform context
  # @return [Coradoc::CoreModel::Base, Array, String, nil]
  # @raise [NotImplementedError] unless overridden by the subclass
  def apply(element, context)
    message = "#{self.class}#apply not implemented"
    raise NotImplementedError, message
  end

  # Ordering weight of the rule; rules with a higher value are checked
  # first. Subclasses override this when the default of 0 is not suitable.
  #
  # @return [Integer]
  def priority
    0
  end
end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
# Registry for transform rules.
#
# Manages registration and lookup of rules for OOXML element types.
# Rules are checked in priority order (highest first); rules sharing the
# same priority are checked in the order they were registered.
# When no rule matches, #find_rule raises ArgumentError.
#
# Follows Open/Closed Principle: new rules are added by registering,
# not by modifying the registry class.
class RuleRegistry
  def initialize
    # Kept sorted by descending priority at all times (see #register).
    @rules = []
  end

  # Register a rule instance
  #
  # The rule is inserted behind every already-registered rule whose
  # priority is greater than or equal to its own. This keeps the list
  # ordered without relying on a sort, which Ruby does not guarantee to
  # be stable for equal keys. The priority is read once, at registration.
  #
  # @param rule [Rule] the rule to register
  # @return [self]
  # @raise [ArgumentError] if the argument is not a Rule
  def register(rule)
    raise ArgumentError, "Expected Rule, got #{rule.class}" unless rule.is_a?(Rule)

    priority = rule.priority
    position = @rules.index { |existing| existing.priority < priority } || @rules.size
    @rules.insert(position, rule)
    self
  end

  # Find the first rule that matches the element
  #
  # @param element [Object] OOXML element to find a rule for
  # @return [Rule] matching rule
  # @raise [ArgumentError] if no rule matches
  def find_rule(element)
    @rules.find { |r| r.matches?(element) } ||
      raise(ArgumentError, "No transform rule registered for #{element.class}")
  end

  # Check if any rule matches the element
  #
  # @param element [Object] element to check
  # @return [Boolean]
  def matches?(element)
    @rules.any? { |r| r.matches?(element) }
  end

  # Number of registered rules
  # @return [Integer]
  def size
    @rules.size
  end
end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:bookmarkStart to a metadata hash for attaching to
8
+ # the next CoreModel element.
9
+ #
10
+ # Bookmarks in OOXML are position markers (not content containers).
11
+ # The orchestrator collects bookmark IDs and attaches them as
12
+ # element attributes on the containing paragraph/section.
13
class BookmarkRule < Rule
  # True for bookmark start and bookmark end markers. Always false while
  # the Uniword WordprocessingML namespace is not loaded.
  def matches?(element)
    return false unless defined?(Uniword::Wordprocessingml)

    case element
    when Uniword::Wordprocessingml::BookmarkStart, Uniword::Wordprocessingml::BookmarkEnd
      true
    else
      false
    end
  end

  # Produces bookmark metadata rather than a CoreModel node.
  #
  # @param element [Object] a bookmark start or end marker
  # @return [Hash, nil] `{ id:, name: }` (string or nil values) for a
  #   bookmark start; nil for anything else, e.g. a bookmark end
  def apply(element, _context)
    return nil unless element.is_a?(Uniword::Wordprocessingml::BookmarkStart)

    bookmark_id = element.id
    bookmark_name = element.name
    { id: bookmark_id&.to_s, name: bookmark_name&.to_s }
  end
end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:br (Break) elements.
8
+ #
9
+ # Page breaks become CoreModel::Block (page_break).
10
+ # Line breaks become CoreModel::InlineElement (hard_line_break).
11
class BreakRule < Rule
  # Truthy only when the Uniword Break class is loaded and the element is
  # an instance of it; nil while the class is not defined.
  def matches?(element)
    return unless defined?(Uniword::Wordprocessingml::Break)

    element.is_a?(Uniword::Wordprocessingml::Break)
  end

  # A break whose type is 'page' becomes a page_break block; every other
  # break type becomes an inline hard_line_break.
  #
  # @param brk [Object] break element exposing #type
  # @return [Coradoc::CoreModel::Block, Coradoc::CoreModel::InlineElement]
  def apply(brk, _context)
    return Coradoc::CoreModel::Block.new(element_type: 'page_break') if brk.type == 'page'

    Coradoc::CoreModel::InlineElement.new(format_type: 'hard_line_break')
  end
end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:footnoteReference to CoreModel::FootnoteReference.
8
+ #
9
+ # Footnote content is looked up from the context's footnotes map,
10
+ # which is populated by the ToCoreModel orchestrator before
11
+ # transforming body elements.
12
class FootnoteRule < Rule
  # Truthy only when the Uniword FootnoteReference class is loaded and
  # the element is an instance of it; nil while the class is not defined.
  def matches?(element)
    return unless defined?(Uniword::Wordprocessingml::FootnoteReference)

    element.is_a?(Uniword::Wordprocessingml::FootnoteReference)
  end

  # Builds a footnote reference carrying the stringified id of the OOXML
  # reference (nil when the reference has no id). The context is not used.
  #
  # @param ref [Object] footnote reference exposing #id
  # @return [CoreModel::FootnoteReference]
  def apply(ref, _context)
    raw_id = ref.id
    CoreModel::FootnoteReference.new(id: raw_id&.to_s)
  end
end
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms heading paragraphs to CoreModel::StructuralElement.
8
+ #
9
+ # Heading detection uses StyleResolver which checks pStyle values
10
+ # (like "Heading1", "heading 2") and outline levels.
11
+ #
12
+ # This rule is NOT registered in the RuleRegistry — instead, the
13
+ # ToCoreModel orchestrator dispatches to it directly after checking
14
+ # the style resolver. This avoids the problem of matches() needing
15
+ # context to determine if a paragraph is a heading.
16
class HeadingRule < Rule
  include OrderedContent

  # Always false: this rule is never selected through matching.
  def matches?(_element)
    false
  end

  # Builds a 'section' structural element from a heading paragraph.
  #
  # The level comes from the context's style resolver and defaults to 1
  # when the resolver yields nothing. The title is the plain text of the
  # paragraph's inline content; the id is taken from the first bookmark
  # start, if any.
  #
  # @param paragraph [Object] heading paragraph
  # @param context [Context] must expose #style_resolver
  # @return [CoreModel::StructuralElement]
  def apply(paragraph, context)
    resolved_level = context.style_resolver.heading_level(paragraph)
    heading_text = extract_title(paragraph, context)
    anchor_id = extract_bookmark_id(paragraph)

    CoreModel::StructuralElement.new(
      element_type: 'section',
      level: resolved_level || 1,
      title: heading_text,
      id: anchor_id
    )
  end

  private

  # Plain-text title assembled from the paragraph's transformed content.
  def extract_title(paragraph, context)
    extract_plain_text(transform_paragraph_content(paragraph, context))
  end

  # Stringified id of the first bookmark start, or nil when the paragraph
  # has no bookmark starts.
  def extract_bookmark_id(paragraph)
    marks = paragraph.bookmark_starts
    marks.nil? || marks.empty? ? nil : marks.first.id&.to_s
  end
end
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:hyperlink elements to CoreModel::InlineElement (link).
8
+ #
9
+ # External hyperlinks have r:id (URL). Internal links have w:anchor
10
+ # (bookmark reference). Both are captured in the target attribute.
11
class HyperlinkRule < Rule
  # Truthy only when the Uniword Hyperlink class is loaded and the
  # element is an instance of it.
  def matches?(element)
    defined?(Uniword::Wordprocessingml::Hyperlink) &&
      element.is_a?(Uniword::Wordprocessingml::Hyperlink)
  end

  # Builds a 'link' inline element whose content is the plain text of the
  # hyperlink's runs.
  #
  # @param hyperlink [Object] hyperlink exposing #runs, #id and #anchor
  # @param context [Context] used to transform the contained runs
  # @return [Coradoc::CoreModel::InlineElement]
  def apply(hyperlink, context)
    content = extract_content(hyperlink, context)
    text = flatten_to_string(content)

    Coradoc::CoreModel::InlineElement.new(
      format_type: 'link',
      target: resolve_target(hyperlink),
      content: text
    )
  end

  private

  # @return [String, nil] the hyperlink's id when present and non-empty,
  #   otherwise "#<anchor>" for an internal link, otherwise nil
  def resolve_target(hyperlink)
    # External link: the r:id value is returned as-is.
    # NOTE(review): in OOXML r:id is a relationship id — presumably it is
    # resolved to the actual URL elsewhere; confirm against the caller.
    return hyperlink.id if hyperlink.id && !hyperlink.id.empty?

    # Internal link (bookmark anchor)
    "##{hyperlink.anchor}" if hyperlink.anchor
  end

  # Transformed runs of the hyperlink, with nil results removed.
  def extract_content(hyperlink, context)
    return [] if hyperlink.runs.nil?

    hyperlink.runs.map { |r| context.transform(r) }.compact
  end

  # Concatenate transformed run output into plain text.
  #
  # Nested arrays are flattened first, so a run that transforms into
  # several nodes still contributes readable text.
  def flatten_to_string(content)
    Array(content).flatten.map { |node| plain_text_of(node) }.join
  end

  # Text carried by a single transformed node. Nodes exposing #content
  # contribute that content rather than their default `to_s`, which for a
  # plain object is an inspect-style string.
  def plain_text_of(node)
    return node if node.is_a?(String)

    node.respond_to?(:content) ? node.content.to_s : node.to_s
  end
end
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,125 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms w:drawing and w:pict elements to CoreModel::Image.
8
+ #
9
+ # Extracts image reference data (relationship ID, dimensions, alt text).
10
+ # Binary data extraction is handled by the caller via the image_refs
11
+ # list in Context.
12
class ImageRule < Rule
  # OOXML uses EMU (English Metric Units): 1 inch = 914400 EMU
  EMU_PER_PX = 9525

  # True for drawing and picture elements. Always false while the Uniword
  # WordprocessingML namespace is not loaded.
  def matches?(element)
    return false unless defined?(Uniword::Wordprocessingml)

    case element
    when Uniword::Wordprocessingml::Drawing, Uniword::Wordprocessingml::Picture
      true
    else
      false
    end
  end

  # Extracts the image reference, registers it on the context and wraps
  # it in a CoreModel::Image.
  #
  # @param element [Object] drawing or picture element
  # @param context [Context] must respond to #register_image
  # @return [CoreModel::Image]
  def apply(element, context)
    image_ref = extract_reference(element)
    context.register_image(image_ref)

    CoreModel::Image.new(**image_ref.slice(:src, :alt, :width, :height, :inline))
  end

  private

  # Reference hash (:src, :alt, :width, :height, :inline) for the element.
  def extract_reference(element)
    if element.is_a?(Uniword::Wordprocessingml::Drawing)
      extract_drawing_ref(element)
    elsif element.is_a?(Uniword::Wordprocessingml::Picture)
      extract_picture_ref(element)
    else
      blank_ref
    end
  end

  # A drawing holds either an inline or an anchored image; inline takes
  # precedence when both are present.
  def extract_drawing_ref(drawing)
    return container_ref(drawing.inline, inline: true) if drawing.inline
    return container_ref(drawing.anchor, inline: false) if drawing.anchor

    blank_ref
  end

  # Shared extraction for inline and anchor containers, which expose the
  # same extent / doc_properties / graphic accessors.
  def container_ref(container, inline:)
    extent = container.extent
    doc_pr = container.doc_properties
    graphic = container.graphic

    {
      src: extract_embed_ref(graphic),
      alt: doc_pr&.name&.to_s || doc_pr&.id&.to_s,
      width: extent_to_px(extent, :cx),
      height: extent_to_px(extent, :cy),
      inline: inline
    }
  end

  # VML-based pictures: no data is extracted, only an empty reference.
  def extract_picture_ref(_pict)
    blank_ref
  end

  # Fresh reference hash with no image data, flagged as inline.
  def blank_ref
    { src: nil, alt: nil, width: nil, height: nil, inline: true }
  end

  # Navigate: Graphic → GraphicData → Picture → BlipFill → Blip → embed,
  # giving up with nil as soon as one link of the chain is missing.
  def extract_embed_ref(graphic)
    blip = %i[graphic_data picture blip_fill blip].reduce(graphic) do |node, step|
      return nil unless node

      node.public_send(step)
    end
    blip&.embed
  end

  # Converts one dimension (:cx or :cy) of an extent from EMU to a
  # "<n>px" string using integer division.
  #
  # @return [String, nil] nil when the extent or value is missing, the
  #   dimension is unknown, or the result is not positive
  def extent_to_px(extent, dimension)
    return unless extent

    raw = extent.public_send(dimension) if %i[cx cy].include?(dimension)
    return unless raw

    pixels = raw.to_i / EMU_PER_PX
    "#{pixels}px" if pixels.positive?
  end
end
122
+ end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms list-item paragraphs to CoreModel::ListItem.
8
+ #
9
+ # Each paragraph with numPr (numbering properties) becomes a ListItem.
10
+ # The ToCoreModel orchestrator groups consecutive items with the same
11
+ # numId into a single ListBlock.
12
+ #
13
+ # Children are stored as InlineElement objects (via transform_paragraph_content)
14
+ # while content is the plain text representation.
15
+ #
16
+ # This rule is NOT registered in the RuleRegistry — the orchestrator
17
+ # dispatches directly after checking style_resolver.list_item?.
18
class ListItemRule < Rule
  include OrderedContent

  # Always false: this rule is never selected through matching.
  def matches?(_element)
    false
  end

  # Builds a list item whose `content` is the plain text of the paragraph
  # and whose `children` are the transformed inline nodes.
  #
  # @param paragraph [Object] list paragraph exposing #properties
  # @param context [Context] transform context
  # @return [CoreModel::ListItem]
  def apply(paragraph, context)
    # nil.to_i is 0, so missing properties or ilvl mean the top level.
    depth = paragraph.properties&.ilvl.to_i
    inline_nodes = transform_paragraph_content(paragraph, context)

    CoreModel::ListItem.new(
      marker: marker_for(depth),
      content: extract_plain_text(inline_nodes)
    ).tap { |list_item| list_item.children = inline_nodes }
  end

  private

  # One asterisk for level 0, plus one more per nesting level.
  def marker_for(level)
    return '*' if level.zero?

    '*' * level.succ
  end
end
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Docx
5
+ module Transform
6
+ module Rules
7
+ # Transforms OMML math elements to CoreModel.
8
+ #
9
+ # Display math (m:oMathPara) → CoreModel::Block (stem)
10
+ # Inline math (m:oMath) → CoreModel::InlineElement (stem)
11
+ #
12
+ # Uses Plurimath for OMML → LaTeX conversion when available.
13
+ # Falls back to raw XML string when Plurimath is not loaded.
14
class MathRule < Rule
  # True for OMML math paragraphs and inline math. Always false while the
  # Uniword math namespace is not loaded.
  def matches?(element)
    return false unless defined?(Uniword::Math)

    element.is_a?(Uniword::Math::OMathPara) ||
      element.is_a?(Uniword::Math::OMath)
  end

  # Display math becomes a '++++'-delimited latexmath block; inline math
  # becomes a 'stem' inline element.
  #
  # @param element [Object] OMML element accepted by #matches?
  # @return [CoreModel::Block, CoreModel::InlineElement]
  def apply(element, _context)
    latex = omml_to_latex(element)

    if display_math?(element)
      CoreModel::Block.new(
        element_type: 'block',
        delimiter_type: '++++',
        language: 'latexmath',
        content: latex
      )
    else
      CoreModel::InlineElement.new(
        format_type: 'stem',
        content: latex
      )
    end
  end

  private

  def display_math?(element)
    defined?(Uniword::Math::OMathPara) &&
      element.is_a?(Uniword::Math::OMathPara)
  end

  # LaTeX for the element when Plurimath is loaded; otherwise the raw
  # serialized XML (empty string when nothing can be serialized).
  def omml_to_latex(element)
    return plurimath_to_latex(element) if defined?(Plurimath)

    # Fallback: serialize to XML string
    element_to_xml(element) || ''
  end

  # Converts the element's OMML to LaTeX through Plurimath's documented
  # entry point, Plurimath::Math.parse(input, :omml).
  #
  # Conversion is best-effort: any StandardError raised while parsing or
  # rendering yields an empty string instead of aborting the transform.
  def plurimath_to_latex(element)
    xml = element_to_xml(element)
    return '' if xml.nil? || xml.empty?

    Plurimath::Math.parse(xml, :omml).to_latex
  rescue StandardError
    ''
  end

  # Serializes any element that can produce XML. The check is by
  # capability rather than class so that every element accepted by
  # #matches? (Uniword::Math::OMathPara / OMath) is serialized.
  #
  # @return [String, nil] '' when the element cannot be serialized,
  #   otherwise whatever the element's #to_xml returns
  def element_to_xml(element)
    return '' unless element.respond_to?(:to_xml)

    element.to_xml
  end
end
79
+ end
80
+ end
81
+ end
82
+ end