RubyGems - coradoc-docx - Versions diffs - 0.1.0 - Mend

coradoc-docx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +7 -0
data/README.adoc +164 -0
data/lib/coradoc/docx/transform/context.rb +72 -0
data/lib/coradoc/docx/transform/from_core_model.rb +577 -0
data/lib/coradoc/docx/transform/numbering_resolver.rb +127 -0
data/lib/coradoc/docx/transform/ordered_content.rb +95 -0
data/lib/coradoc/docx/transform/rule.rb +57 -0
data/lib/coradoc/docx/transform/rule_registry.rb +60 -0
data/lib/coradoc/docx/transform/rules/bookmark_rule.rb +34 -0
data/lib/coradoc/docx/transform/rules/break_rule.rb +30 -0
data/lib/coradoc/docx/transform/rules/footnote_rule.rb +27 -0
data/lib/coradoc/docx/transform/rules/heading_rule.rb +53 -0
data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb +58 -0
data/lib/coradoc/docx/transform/rules/image_rule.rb +125 -0
data/lib/coradoc/docx/transform/rules/list_item_rule.rb +47 -0
data/lib/coradoc/docx/transform/rules/math_rule.rb +82 -0
data/lib/coradoc/docx/transform/rules/paragraph_rule.rb +65 -0
data/lib/coradoc/docx/transform/rules/proof_error_rule.rb +25 -0
data/lib/coradoc/docx/transform/rules/run_rule.rb +189 -0
data/lib/coradoc/docx/transform/rules/simple_field_rule.rb +87 -0
data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb +36 -0
data/lib/coradoc/docx/transform/rules/table_rule.rb +85 -0
data/lib/coradoc/docx/transform/rules/text_rule.rb +25 -0
data/lib/coradoc/docx/transform/style_resolver.rb +249 -0
data/lib/coradoc/docx/transform/to_core_model.rb +340 -0
data/lib/coradoc/docx/transform.rb +38 -0
data/lib/coradoc/docx/version.rb +7 -0
data/lib/coradoc/docx.rb +99 -0
metadata +155 -0

data/lib/coradoc/docx/transform/ordered_content.rb ADDED Viewed

@@ -0,0 +1,95 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      # Utility for iterating paragraph content in document order.
+      #
+      # OOXML paragraphs have separate arrays for runs, hyperlinks, SDTs, etc.
+      # but `element_order` (from lutaml-model mixed_content) preserves the
+      # interleaved sequence. This module provides a single method to walk
+      # paragraph content in the correct order.
+      #
+      # Used by ParagraphRule, ListItemRule, and HeadingRule.
+      module OrderedContent
+        # Iterate paragraph inline content in document order.
+        #
+        # @param paragraph [Uniword::Wordprocessingml::Paragraph]
+        # @param context [Context] transform context with registry
+        # @return [Array] transformed content (Strings, InlineElements, etc.)
+        def transform_paragraph_content(paragraph, context)
+          order = paragraph.is_a?(Uniword::Wordprocessingml::Paragraph) ? paragraph.element_order : nil
+          if order && !order.empty?
+            transform_ordered(paragraph, order, context)
+          else
+            transform_sequential(paragraph, context)
+          end
+        end
+        # Flatten children array to plain text string.
+        #
+        # @param children [Array] mixed content (Strings, InlineElements, Blocks)
+        # @return [String]
+        def extract_plain_text(children)
+          children.map do |c|
+            case c
+            when String then c
+            when CoreModel::InlineElement then c.content.to_s
+            when CoreModel::Block then c.content.to_s
+            else c.to_s
+            end
+          end.join
+        end
+        private
+        def transform_ordered(paragraph, order, context)
+          counters = Hash.new(0)
+          result = []
+          order.each do |entry|
+            name = entry.is_a?(String) ? entry : entry.name
+            idx = counters[name]
+            counters[name] = idx + 1
+            item = case name
+                   when 'r'
+                     run = paragraph.runs[idx]
+                     context.transform(run) if run
+                   when 'hyperlink'
+                     hl = paragraph.hyperlinks[idx]
+                     context.transform(hl) if hl
+                   when 'sdt'
+                     sdt = paragraph.structured_document_tags&.[](idx)
+                     context.transform(sdt) if sdt
+                   when 'oMathPara'
+                     math = paragraph.o_math_paras&.[](idx)
+                     context.transform(math) if math
+                   when 'oMath'
+                     math = paragraph.o_maths&.[](idx)
+                     context.transform(math) if math
+                   when 'fldSimple'
+                     field = paragraph.simple_fields&.[](idx)
+                     context.transform(field) if field
+                   when 'proofErr'
+                     # Proofing errors have no semantic value — skip silently
+                     nil
+                   end
+            result << item if item
+          end
+          result.compact
+        end
+        def transform_sequential(paragraph, context)
+          content = []
+          (paragraph.runs || []).each { |r| content << context.transform(r) }
+          (paragraph.hyperlinks || []).each { |h| content << context.transform(h) }
+          content.compact
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rule.rb ADDED Viewed

@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      # Base class for OOXML → CoreModel transform rules.
+      #
+      # Each rule handles one OOXML element type and produces a CoreModel
+      # node. Rules are registered in RuleRegistry and dispatched by the
+      # ToCoreModel orchestrator.
+      #
+      # Subclasses must implement:
+      #   - matches?(element) → true if this rule handles the element
+      #   - apply(element, context) → CoreModel node or Array of nodes
+      #
+      # @example Implementing a custom rule
+      #   class MyRule < Rule
+      #     def matches?(element)
+      #       element.is_a?(Uniword::Wordprocessingml::MyElement)
+      #     end
+      #
+      #     def apply(element, context)
+      #       Coradoc::CoreModel::Block.new(
+      #         element_type: 'paragraph',
+      #         content: element.text
+      #       )
+      #     end
+      #   end
+      class Rule
+        # Check if this rule handles the given element
+        #
+        # @param element [Object] OOXML element to check
+        # @return [Boolean]
+        def matches?(element)
+          raise NotImplementedError, "#{self.class}#matches? not implemented"
+        end
+        # Transform an OOXML element to a CoreModel node
+        #
+        # @param element [Object] OOXML element to transform
+        # @param context [Context] shared transform context
+        # @return [Coradoc::CoreModel::Base, Array, String, nil]
+        def apply(element, context)
+          raise NotImplementedError, "#{self.class}#apply not implemented"
+        end
+        # Rule priority — higher priority rules are checked first.
+        # Override in subclasses when needed.
+        #
+        # @return [Integer]
+        def priority
+          0
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rule_registry.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      # Registry for transform rules.
+      #
+      # Manages registration and lookup of rules for OOXML element types.
+      # Rules are checked in priority order (highest first).
+      # Falls back to NullRule which raises ArgumentError.
+      #
+      # Follows Open/Closed Principle: new rules are added by registering,
+      # not by modifying the registry class.
+      class RuleRegistry
+        def initialize
+          @rules = []
+        end
+        # Register a rule instance
+        #
+        # @param rule [Rule] the rule to register
+        # @return [self]
+        def register(rule)
+          unless rule.is_a?(Rule)
+            raise ArgumentError,
+                  "Expected Rule, got #{rule.class}"
+          end
+          @rules << rule
+          @rules.sort_by! { |r| -r.priority }
+          self
+        end
+        # Find the first rule that matches the element
+        #
+        # @param element [Object] OOXML element to find a rule for
+        # @return [Rule] matching rule
+        # @raise [ArgumentError] if no rule matches
+        def find_rule(element)
+          @rules.find { |r| r.matches?(element) } ||
+            raise(ArgumentError, "No transform rule registered for #{element.class}")
+        end
+        # Check if any rule matches the element
+        #
+        # @param element [Object] element to check
+        # @return [Boolean]
+        def matches?(element)
+          @rules.any? { |r| r.matches?(element) }
+        end
+        # Number of registered rules
+        # @return [Integer]
+        def size
+          @rules.size
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/bookmark_rule.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:bookmarkStart to a metadata hash for attaching to
+        # the next CoreModel element.
+        #
+        # Bookmarks in OOXML are position markers (not content containers).
+        # The orchestrator collects bookmark IDs and attaches them as
+        # element attributes on the containing paragraph/section.
+        class BookmarkRule < Rule
+          def matches?(element)
+            return false unless defined?(Uniword::Wordprocessingml)
+            element.is_a?(Uniword::Wordprocessingml::BookmarkStart) ||
+              element.is_a?(Uniword::Wordprocessingml::BookmarkEnd)
+          end
+          # Returns a hash with bookmark metadata, not a CoreModel node.
+          # The orchestrator uses this to set the id on the parent element.
+          def apply(element, _context)
+            if element.is_a?(Uniword::Wordprocessingml::BookmarkStart)
+              { id: element.id&.to_s, name: element.name&.to_s }
+            else
+              nil # BookmarkEnd — no useful data
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/break_rule.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:br (Break) elements.
+        #
+        # Page breaks become CoreModel::Block (page_break).
+        # Line breaks become CoreModel::InlineElement (hard_line_break).
+        class BreakRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::Break) &&
+              element.is_a?(Uniword::Wordprocessingml::Break)
+          end
+          def apply(brk, _context)
+            if brk.type == 'page'
+              Coradoc::CoreModel::Block.new(element_type: 'page_break')
+            else
+              Coradoc::CoreModel::InlineElement.new(
+                format_type: 'hard_line_break'
+              )
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/footnote_rule.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:footnoteReference to CoreModel::FootnoteReference.
+        #
+        # Footnote content is looked up from the context's footnotes map,
+        # which is populated by the ToCoreModel orchestrator before
+        # transforming body elements.
+        class FootnoteRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::FootnoteReference) &&
+              element.is_a?(Uniword::Wordprocessingml::FootnoteReference)
+          end
+          def apply(ref, _context)
+            id = ref.id&.to_s
+            CoreModel::FootnoteReference.new(id: id)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/heading_rule.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms heading paragraphs to CoreModel::StructuralElement.
+        #
+        # Heading detection uses StyleResolver which checks pStyle values
+        # (like "Heading1", "heading 2") and outline levels.
+        #
+        # This rule is NOT registered in the RuleRegistry — instead, the
+        # ToCoreModel orchestrator dispatches to it directly after checking
+        # the style resolver. This avoids the problem of matches() needing
+        # context to determine if a paragraph is a heading.
+        class HeadingRule < Rule
+          include OrderedContent
+          def matches?(_element)
+            false # Never auto-matched; orchestrator dispatches directly
+          end
+          def apply(paragraph, context)
+            level = context.style_resolver.heading_level(paragraph) || 1
+            title = extract_title(paragraph, context)
+            id = extract_bookmark_id(paragraph)
+            CoreModel::StructuralElement.new(
+              element_type: 'section',
+              level: level,
+              title: title,
+              id: id
+            )
+          end
+          private
+          def extract_title(paragraph, context)
+            children = transform_paragraph_content(paragraph, context)
+            extract_plain_text(children)
+          end
+          def extract_bookmark_id(paragraph)
+            starts = paragraph.bookmark_starts
+            return nil if starts.nil? || starts.empty?
+            starts.first.id&.to_s
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:hyperlink elements to CoreModel::InlineElement (link).
+        #
+        # External hyperlinks have r:id (URL). Internal links have w:anchor
+        # (bookmark reference). Both are captured in the target attribute.
+        class HyperlinkRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::Hyperlink) &&
+              element.is_a?(Uniword::Wordprocessingml::Hyperlink)
+          end
+          def apply(hyperlink, context)
+            content = extract_content(hyperlink, context)
+            text = flatten_to_string(content)
+            Coradoc::CoreModel::InlineElement.new(
+              format_type: 'link',
+              target: resolve_target(hyperlink),
+              content: text
+            )
+          end
+          private
+          def resolve_target(hyperlink)
+            # External link (URL stored in r:id)
+            return hyperlink.id if hyperlink.id && !hyperlink.id.empty?
+            # Internal link (bookmark anchor)
+            "##{hyperlink.anchor}" if hyperlink.anchor
+          end
+          def extract_content(hyperlink, context)
+            return [] if hyperlink.runs.nil?
+            hyperlink.runs.map { |r| context.transform(r) }.compact
+          end
+          def flatten_to_string(content)
+            case content
+            when Array
+              content.map { |c| c.is_a?(String) ? c : c.to_s }.join
+            when String
+              content
+            else
+              content.to_s
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/image_rule.rb ADDED Viewed

@@ -0,0 +1,125 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:drawing and w:pict elements to CoreModel::Image.
+        #
+        # Extracts image reference data (relationship ID, dimensions, alt text).
+        # Binary data extraction is handled by the caller via the image_refs
+        # list in Context.
+        class ImageRule < Rule
+          def matches?(element)
+            return false unless defined?(Uniword::Wordprocessingml)
+            element.is_a?(Uniword::Wordprocessingml::Drawing) ||
+              element.is_a?(Uniword::Wordprocessingml::Picture)
+          end
+          def apply(element, context)
+            ref = extract_reference(element)
+            context.register_image(ref)
+            CoreModel::Image.new(
+              src: ref[:src],
+              alt: ref[:alt],
+              width: ref[:width],
+              height: ref[:height],
+              inline: ref[:inline]
+            )
+          end
+          private
+          def extract_reference(element)
+            case element
+            when Uniword::Wordprocessingml::Drawing
+              extract_drawing_ref(element)
+            when Uniword::Wordprocessingml::Picture
+              extract_picture_ref(element)
+            else
+              { src: nil, alt: nil, width: nil, height: nil, inline: true }
+            end
+          end
+          def extract_drawing_ref(drawing)
+            if drawing.inline
+              extract_inline_ref(drawing.inline)
+            elsif drawing.anchor
+              extract_anchor_ref(drawing.anchor)
+            else
+              { src: nil, alt: nil, width: nil, height: nil, inline: true }
+            end
+          end
+          def extract_inline_ref(inline)
+            extent = inline.extent
+            doc_pr = inline.doc_properties
+            graphic = inline.graphic
+            {
+              src: extract_embed_ref(graphic),
+              alt: doc_pr&.name&.to_s || doc_pr&.id&.to_s,
+              width: extent_to_px(extent, :cx),
+              height: extent_to_px(extent, :cy),
+              inline: true
+            }
+          end
+          def extract_anchor_ref(anchor)
+            extent = anchor.extent
+            doc_pr = anchor.doc_properties
+            graphic = anchor.graphic
+            {
+              src: extract_embed_ref(graphic),
+              alt: doc_pr&.name&.to_s || doc_pr&.id&.to_s,
+              width: extent_to_px(extent, :cx),
+              height: extent_to_px(extent, :cy),
+              inline: false
+            }
+          end
+          def extract_picture_ref(_pict)
+            # VML-based pictures — less common, extract basic info
+            { src: nil, alt: nil, width: nil, height: nil, inline: true }
+          end
+          def extract_embed_ref(graphic)
+            return nil unless graphic
+            graphic_data = graphic.graphic_data
+            return nil unless graphic_data
+            # Navigate: GraphicData → Picture → BlipFill → Blip → embed
+            picture = graphic_data.picture
+            return nil unless picture
+            blip_fill = picture.blip_fill
+            return nil unless blip_fill
+            blip = blip_fill.blip
+            blip&.embed
+          end
+          # OOXML uses EMU (English Metric Units): 1 inch = 914400 EMU
+          EMU_PER_PX = 9525
+          def extent_to_px(extent, dimension)
+            return nil unless extent
+            value = case dimension
+                    when :cx then extent.cx
+                    when :cy then extent.cy
+                    end
+            return nil unless value
+            px = value.to_i / EMU_PER_PX
+            px.positive? ? "#{px}px" : nil
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/list_item_rule.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms list-item paragraphs to CoreModel::ListItem.
+        #
+        # Each paragraph with numPr (numbering properties) becomes a ListItem.
+        # The ToCoreModel orchestrator groups consecutive items with the same
+        # numId into a single ListBlock.
+        #
+        # Children are stored as InlineElement objects (via transform_paragraph_content)
+        # while content is the plain text representation.
+        #
+        # This rule is NOT registered in the RuleRegistry — the orchestrator
+        # dispatches directly after checking style_resolver.list_item?.
+        class ListItemRule < Rule
+          include OrderedContent
+          def matches?(_element)
+            false # Never auto-matched; orchestrator dispatches directly
+          end
+          def apply(paragraph, context)
+            ilvl = paragraph.properties&.ilvl.to_i
+            children = transform_paragraph_content(paragraph, context)
+            item = CoreModel::ListItem.new(
+              marker: marker_for(ilvl),
+              content: extract_plain_text(children)
+            )
+            item.children = children
+            item
+          end
+          private
+          def marker_for(level)
+            level.zero? ? '*' : '*' * (level + 1)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/math_rule.rb ADDED Viewed

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms OMML math elements to CoreModel.
+        #
+        # Display math (m:oMathPara) → CoreModel::Block (stem)
+        # Inline math (m:oMath) → CoreModel::InlineElement (stem)
+        #
+        # Uses Plurimath for OMML → LaTeX conversion when available.
+        # Falls back to raw XML string when Plurimath is not loaded.
+        class MathRule < Rule
+          def matches?(element)
+            return false unless defined?(Uniword::Math)
+            element.is_a?(Uniword::Math::OMathPara) ||
+              element.is_a?(Uniword::Math::OMath)
+          end
+          def apply(element, _context)
+            latex = omml_to_latex(element)
+            if display_math?(element)
+              CoreModel::Block.new(
+                element_type: 'block',
+                delimiter_type: '++++',
+                language: 'latexmath',
+                content: latex
+              )
+            else
+              CoreModel::InlineElement.new(
+                format_type: 'stem',
+                content: latex
+              )
+            end
+          end
+          private
+          def display_math?(element)
+            defined?(Uniword::Math::OMathPara) &&
+              element.is_a?(Uniword::Math::OMathPara)
+          end
+          def omml_to_latex(element)
+            if defined?(Plurimath)
+              plurimath_to_latex(element)
+            else
+              # Fallback: serialize to XML string
+              element_respond_to_xml(element) || ''
+            end
+          end
+          def plurimath_to_latex(element)
+            xml = element_to_xml(element)
+            return '' if xml.nil? || xml.empty?
+            begin
+              formula = Plurimath::OMML.parse(xml)
+              formula.to_latex
+            rescue StandardError
+              ''
+            end
+          end
+          def element_to_xml(element)
+            return '' unless element.is_a?(Uniword::Wordprocessingml::AlternateContent) ||
+                             element.is_a?(Uniword::Wordprocessingml::OMath)
+            element.to_xml
+          end
+          def element_respond_to_xml(element)
+            element_to_xml(element)
+          end
+        end
+      end
+    end
+  end
+end