RubyGems - coradoc-docx - Versions diffs - 0.1.0 - Mend

coradoc-docx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +7 -0
data/README.adoc +164 -0
data/lib/coradoc/docx/transform/context.rb +72 -0
data/lib/coradoc/docx/transform/from_core_model.rb +577 -0
data/lib/coradoc/docx/transform/numbering_resolver.rb +127 -0
data/lib/coradoc/docx/transform/ordered_content.rb +95 -0
data/lib/coradoc/docx/transform/rule.rb +57 -0
data/lib/coradoc/docx/transform/rule_registry.rb +60 -0
data/lib/coradoc/docx/transform/rules/bookmark_rule.rb +34 -0
data/lib/coradoc/docx/transform/rules/break_rule.rb +30 -0
data/lib/coradoc/docx/transform/rules/footnote_rule.rb +27 -0
data/lib/coradoc/docx/transform/rules/heading_rule.rb +53 -0
data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb +58 -0
data/lib/coradoc/docx/transform/rules/image_rule.rb +125 -0
data/lib/coradoc/docx/transform/rules/list_item_rule.rb +47 -0
data/lib/coradoc/docx/transform/rules/math_rule.rb +82 -0
data/lib/coradoc/docx/transform/rules/paragraph_rule.rb +65 -0
data/lib/coradoc/docx/transform/rules/proof_error_rule.rb +25 -0
data/lib/coradoc/docx/transform/rules/run_rule.rb +189 -0
data/lib/coradoc/docx/transform/rules/simple_field_rule.rb +87 -0
data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb +36 -0
data/lib/coradoc/docx/transform/rules/table_rule.rb +85 -0
data/lib/coradoc/docx/transform/rules/text_rule.rb +25 -0
data/lib/coradoc/docx/transform/style_resolver.rb +249 -0
data/lib/coradoc/docx/transform/to_core_model.rb +340 -0
data/lib/coradoc/docx/transform.rb +38 -0
data/lib/coradoc/docx/version.rb +7 -0
data/lib/coradoc/docx.rb +99 -0
metadata +155 -0

data/lib/coradoc/docx/transform/rules/paragraph_rule.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms regular paragraphs to CoreModel::Block.
+        #
+        # Handles paragraph-style detection for block types:
+        # quote, source, literal, example, or plain paragraph.
+        #
+        # This is the default rule for paragraphs. The orchestrator dispatches
+        # heading and list item paragraphs directly, so this rule only sees
+        # regular paragraphs.
+        class ParagraphRule < Rule
+          include OrderedContent
+          def priority
+            0
+          end
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::Paragraph) &&
+              element.is_a?(Uniword::Wordprocessingml::Paragraph)
+          end
+          def apply(paragraph, context)
+            role = context.style_resolver.role_from_style(paragraph)
+            block_type = block_type_for(role)
+            children = transform_paragraph_content(paragraph, context)
+            id = extract_bookmark_id(paragraph)
+            block = CoreModel::Block.new(
+              element_type: block_type,
+              content: extract_plain_text(children)
+            )
+            block.children = children
+            block.id = id if id
+            block
+          end
+          private
+          def block_type_for(role)
+            case role
+            when :quote then 'quote'
+            when :source then 'source'
+            when :literal then 'literal'
+            when :example then 'example'
+            else 'paragraph'
+            end
+          end
+          def extract_bookmark_id(paragraph)
+            starts = paragraph.bookmark_starts
+            return nil if starts.nil? || starts.empty?
+            starts.first.id&.to_s
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/proof_error_rule.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Silently ignores w:proofErr (proofing error) elements.
+        #
+        # Proofing errors are spelling/grammar markers in OOXML that have
+        # no semantic representation in CoreModel. This rule matches them
+        # and returns nil, effectively stripping them from the output.
+        class ProofErrorRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::ProofError) &&
+              element.is_a?(Uniword::Wordprocessingml::ProofError)
+          end
+          def apply(_element, _context)
+            nil
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/run_rule.rb ADDED Viewed

@@ -0,0 +1,189 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:r (Run) elements to InlineElement or String.
+        #
+        # Runs with formatting become CoreModel::InlineElement nodes.
+        # Plain runs (no formatting properties) return their text directly.
+        #
+        # A single run may carry multiple formatting properties (e.g., bold +
+        # italic). The most specific one wins for format_type, while the
+        # text content is preserved.
+        #
+        # Uses effective_run_properties (when available) to resolve style
+        # inheritance: explicit properties > paragraph style's rPr > basedOn chain.
+        # Falls back to run.properties for backward compatibility.
+        class RunRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::Run) &&
+              element.is_a?(Uniword::Wordprocessingml::Run)
+          end
+          def apply(run, context)
+            # Delegate non-text children (breaks, drawings, footnotes, etc.)
+            non_text = extract_non_text_children(run, context)
+            return non_text.first if non_text.any? && run.text.nil?
+            text = run.text&.content.to_s
+            return '' if text.empty? && non_text.empty?
+            props = effective_props(run)
+            return text if plain_run?(props)
+            fmt = format_type(props, run, context)
+            return text unless fmt
+            CoreModel::InlineElement.new(
+              format_type: fmt,
+              content: text
+            )
+          end
+          private
+          def effective_props(run)
+            ep = run.effective_run_properties
+            return ep if ep
+            run.properties
+          end
+          def extract_non_text_children(run, context)
+            result = []
+            result << context.transform(run.break) if run.break
+            result << context.transform(run.footnote_reference) if run.footnote_reference
+            result << context.transform(run.endnote_reference) if run.endnote_reference
+            run.drawings&.each do |drawing|
+              result << context.transform(drawing)
+            end
+            result << "\t" if run.tab
+            result << context.transform(run.o_math) if run.class.attributes.key?(:o_math) && run.o_math
+            if run.del_text
+              text = run.del_text.is_a?(Uniword::Wordprocessingml::DeletedText) ? run.del_text.content.to_s : run.del_text.to_s
+              unless text.empty?
+                result << CoreModel::InlineElement.new(
+                  format_type: 'strikethrough',
+                  content: text
+                )
+              end
+            end
+            if run.sym
+              char = run.sym.char
+              result << char.to_s if char && !char.empty?
+            end
+            result << "\u2011" if run.no_break_hyphen
+            result << "\u00AD" if run.class.attributes.key?(:soft_hyphen) && run.soft_hyphen
+            result << CoreModel::InlineElement.new(format_type: 'hard_line_break') if run.class.attributes.key?(:carriage_return) && run.carriage_return
+            if run.alternate_content
+              result << extract_alternate_content(run.alternate_content,
+                                                  context)
+            end
+            result.compact
+          end
+          def extract_alternate_content(ac, context)
+            content = if ac.fallback
+                        ac.fallback
+                      elsif ac.choice
+                        ac.choice
+                      end
+            return nil unless content
+            if content.is_a?(Uniword::Wordprocessingml::Run)
+              content.runs&.each { |r| context.transform(r) }
+            elsif content.is_a?(Uniword::Wordprocessingml::Paragraph)
+              content.paragraphs&.flat_map { |p| context.transform(p) }
+            end
+          end
+          def plain_run?(props)
+            return true unless props
+            props.bold.nil? &&
+              props.italic.nil? &&
+              props.underline.nil? &&
+              props.strike.nil? &&
+              props.double_strike.nil? &&
+              props.vertical_align.nil? &&
+              props.small_caps.nil? &&
+              props.caps.nil? &&
+              props.hidden.nil? &&
+              props.highlight.nil?
+          end
+          # Determine the dominant format type.
+          # Checks rStyle-based semantic detection first, then explicit formatting.
+          def format_type(props, run, context)
+            return nil unless props
+            # Check rStyle for semantic role
+            if context.style_resolver.is_a?(Coradoc::Docx::Transform::StyleResolver)
+              role = context.style_resolver.run_semantic_role(run)
+              case role
+              when :monospace then return 'monospace'
+              when :bold then return 'bold'
+              when :italic then return 'italic'
+              end
+            end
+            # Explicit formatting properties
+            if bold?(props)
+              'bold'
+            elsif italic?(props)
+              'italic'
+            elsif props.underline
+              'underline'
+            elsif props.strike || props.double_strike
+              'strikethrough'
+            elsif subscript?(props)
+              'subscript'
+            elsif superscript?(props)
+              'superscript'
+            elsif props.small_caps
+              'small'
+            elsif props.caps
+              'bold'
+            elsif props.highlight
+              'highlight'
+            elsif props.hidden
+              nil
+            end
+          end
+          def bold?(props)
+            props.bold && props.bold.value != false
+          end
+          def italic?(props)
+            props.italic && props.italic.value != false
+          end
+          def subscript?(props)
+            props.vertical_align&.value.to_s == 'subscript'
+          end
+          def superscript?(props)
+            props.vertical_align&.value.to_s == 'superscript'
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/simple_field_rule.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:fldSimple (simple field) elements.
+        #
+        # Simple fields include page numbers, dates, document properties,
+        # and other computed content. This rule extracts the field's resolved
+        # text content; a field without any text produces no output (nil).
+        #
+        # Common field types:
+        # - PAGE → current page number
+        # - NUMPAGES → total page count
+        # - DATE → current date
+        # - TIME → current time
+        # - DOCPROPERTY → document property value
+        # - TITLE → document title
+        # - AUTHOR → document author
+        class SimpleFieldRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::SimpleField) &&
+              element.is_a?(Uniword::Wordprocessingml::SimpleField)
+          end
+          def apply(field, _context)
+            # Try to get the resolved text content first
+            text = field_text(field)
+            return nil if text.nil? || text.empty?
+            # Check if this is a semantic field we should preserve
+            instr = field_instruction(field)
+            case instr
+            when /\A(TITLE|AUTHOR|SUBJECT|KEYWORDS|DOCPROPERTY)\b/i
+              # Document metadata — embed as plain text (already resolved)
+              text
+            when /\A(PAGE|NUMPAGES)\b/i
+              # Page layout fields — skip (not semantic)
+              nil
+            when /\A(HYPERLINK)\b/i
+              # Hyperlink field — extract URL and text
+              url = extract_hyperlink_url(instr)
+              if url
+                CoreModel::InlineElement.new(
+                  format_type: 'link',
+                  content: text,
+                  target: url
+                )
+              else
+                text
+              end
+            when /\A(TOC|PAGEREF|REF|NOTEREF)\b/i
+              # TOC / cross-reference fields — skip (print layout)
+              nil
+            else
+              # Generic field — pass through as text
+              text
+            end
+          end
+          private
+          def field_text(field)
+            if field.runs && !field.runs.empty?
+              return field.runs.map do |r|
+                r.text&.content.to_s
+              end.join
+            end
+            nil
+          end
+          def field_instruction(field)
+            instr = field.instr
+            instr.to_s
+          end
+          def extract_hyperlink_url(instr)
+            match = instr.match(/HYPERLINK\s+"([^"]+)"/i)
+            match&.[](1)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:sdt (Structured Document Tag) elements.
+        #
+        # SDTs wrap content with additional metadata. The transform
+        # unwraps them and delegates to the content's own rules.
+        class StructuredDocumentTagRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::StructuredDocumentTag) &&
+              element.is_a?(Uniword::Wordprocessingml::StructuredDocumentTag)
+          end
+          def apply(sdt, context)
+            # SDTs contain paragraphs and tables — delegate to their rules
+            # via the context's transform method
+            return nil unless sdt.content
+            paragraphs = sdt.content.paragraphs || []
+            tables = sdt.content.tables || []
+            results = []
+            paragraphs.each { |p| results << context.transform(p) }
+            tables.each { |t| results << context.transform(t) }
+            # Return single element or array
+            results.one? ? results.first : results.compact
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/table_rule.rb ADDED Viewed

@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:tbl (Table) elements to CoreModel::Table.
+        #
+        # Walks the OOXML table structure (Table → TableRow → TableCell)
+        # and produces the corresponding CoreModel tree.
+        #
+        # Cell paragraphs are transformed through the rule system to preserve
+        # inline formatting (bold, italic, links) as InlineElement objects.
+        # Print-layout properties (frame, grid, width) are NOT mapped — CoreModel
+        # is a semantic model, not a print layout language.
+        class TableRule < Rule
+          include OrderedContent
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::Table) &&
+              element.is_a?(Uniword::Wordprocessingml::Table)
+          end
+          def apply(table, context)
+            CoreModel::Table.new(
+              rows: table.rows.map { |r| transform_row(r, context) }
+            )
+          end
+          private
+          def transform_row(row, context)
+            CoreModel::TableRow.new(
+              cells: row.cells.map { |c| transform_cell(c, context) },
+              header: row.header?
+            )
+          end
+          def transform_cell(cell, context)
+            inline_children = cell_paragraphs(cell).flat_map do |para|
+              extract_inline_from_paragraph(para, context)
+            end
+            props = cell.properties
+            CoreModel::TableCell.new(
+              content: extract_plain_text(inline_children),
+              alignment: props&.vertical_align&.to_s,
+              colspan: cell.column_span,
+              rowspan: cell.row_span,
+              header: header_cell?(cell),
+              children: inline_children
+            )
+          end
+          # Transform a cell paragraph and extract its inline children
+          def extract_inline_from_paragraph(para, context)
+            transformed = context.transform(para)
+            return [] unless transformed
+            # If it's a Block with children (inline elements), extract them
+            if transformed.is_a?(CoreModel::Block) && transformed.children.any?
+              transformed.children
+            elsif transformed.is_a?(CoreModel::Block)
+              [transformed.content].compact
+            else
+              [transformed]
+            end
+          end
+          def cell_paragraphs(cell)
+            cell.paragraphs || []
+          end
+          def header_cell?(cell)
+            return false unless cell.properties
+            vm = cell.properties.v_merge
+            vm&.value.to_s == 'restart'
+          end
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/rules/text_rule.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      module Rules
+        # Transforms w:t (Text) elements to plain strings.
+        #
+        # Text is returned as a raw string — not wrapped in a CoreModel node.
+        # The caller (RunRule) is responsible for wrapping in InlineElement
+        # when formatting is present.
+        class TextRule < Rule
+          def matches?(element)
+            defined?(Uniword::Wordprocessingml::Text) &&
+              element.is_a?(Uniword::Wordprocessingml::Text)
+          end
+          def apply(text, _context)
+            text.content.to_s
+          end
+        end
+      end
+    end
+  end
+end