RubyGems - coradoc-docx - Versions diffs - 0.1.0 - Mend

coradoc-docx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +7 -0
data/README.adoc +164 -0
data/lib/coradoc/docx/transform/context.rb +72 -0
data/lib/coradoc/docx/transform/from_core_model.rb +577 -0
data/lib/coradoc/docx/transform/numbering_resolver.rb +127 -0
data/lib/coradoc/docx/transform/ordered_content.rb +95 -0
data/lib/coradoc/docx/transform/rule.rb +57 -0
data/lib/coradoc/docx/transform/rule_registry.rb +60 -0
data/lib/coradoc/docx/transform/rules/bookmark_rule.rb +34 -0
data/lib/coradoc/docx/transform/rules/break_rule.rb +30 -0
data/lib/coradoc/docx/transform/rules/footnote_rule.rb +27 -0
data/lib/coradoc/docx/transform/rules/heading_rule.rb +53 -0
data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb +58 -0
data/lib/coradoc/docx/transform/rules/image_rule.rb +125 -0
data/lib/coradoc/docx/transform/rules/list_item_rule.rb +47 -0
data/lib/coradoc/docx/transform/rules/math_rule.rb +82 -0
data/lib/coradoc/docx/transform/rules/paragraph_rule.rb +65 -0
data/lib/coradoc/docx/transform/rules/proof_error_rule.rb +25 -0
data/lib/coradoc/docx/transform/rules/run_rule.rb +189 -0
data/lib/coradoc/docx/transform/rules/simple_field_rule.rb +87 -0
data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb +36 -0
data/lib/coradoc/docx/transform/rules/table_rule.rb +85 -0
data/lib/coradoc/docx/transform/rules/text_rule.rb +25 -0
data/lib/coradoc/docx/transform/style_resolver.rb +249 -0
data/lib/coradoc/docx/transform/to_core_model.rb +340 -0
data/lib/coradoc/docx/transform.rb +38 -0
data/lib/coradoc/docx/version.rb +7 -0
data/lib/coradoc/docx.rb +99 -0
metadata +155 -0

data/lib/coradoc/docx/transform/style_resolver.rb ADDED Viewed

@@ -0,0 +1,249 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      # Resolves paragraph and run styles to semantic roles.
+      #
+      # OOXML paragraphs don't have explicit element types. Instead, their
+      # meaning is determined by style references (e.g., "Heading1" → section)
+      # or by formatting properties (e.g., numPr → list item).
+      #
+      # StyleResolver centralizes this detection so HeadingRule, ListItemRule,
+      # and ParagraphRule don't duplicate the logic.
+      #
+      # The style map is built from the Uniword StylesConfiguration by walking
+      # all style definitions and their basedOn chains.
+      class StyleResolver
+        HEADING_PATTERN = /^(heading|heading|h)\s*(\d+)$/i
+        QUOTE_PATTERN = /\bquote\b/i
+        CODE_PATTERN = /\b(code|source|listing)\b/i
+        LITERAL_PATTERN = /\bliteral\b/i
+        EXAMPLE_PATTERN = /\bexample\b/i
+        # @param styles_configuration [Object, nil] Uniword styles configuration
+        def initialize(styles_configuration)
+          @config = styles_configuration
+          @style_map = build_style_map(styles_configuration)
+        end
+        # Determine the semantic role of a paragraph
+        #
+        # @param paragraph [Uniword::Wordprocessingml::Paragraph]
+        # @return [Symbol] :heading, :list_item, :quote, :source, :literal,
+        #   :example, or :paragraph
+        def semantic_role(paragraph)
+          return :heading if heading?(paragraph)
+          return :list_item if list_item?(paragraph)
+          style_role = role_from_style(paragraph)
+          return style_role if style_role
+          :paragraph
+        end
+        # Check if paragraph is a heading
+        # @param paragraph [Uniword::Wordprocessingml::Paragraph]
+        # @return [Boolean]
+        def heading?(paragraph)
+          return false unless paragraph.properties
+          style_name = resolve_style_name(paragraph)
+          return true if style_name && HEADING_PATTERN.match?(style_name)
+          ol = paragraph.properties.outline_level
+          if ol
+            ol_level = ol.is_a?(Uniword::Wordprocessingml::OutlineLevel) ? ol.value.to_i : ol.to_i
+            return true if ol_level.positive?
+          end
+          style = find_style_for_paragraph(paragraph)
+          if style&.outline_level
+            ol_val = style.outline_level
+            ol_val = ol_val.is_a?(Uniword::Wordprocessingml::OutlineLevel) ? ol_val.value.to_i : ol_val.to_i
+            return true if ol_val.positive?
+          end
+          false
+        end
+        # Get heading level (1-6) or nil
+        # @param paragraph [Uniword::Wordprocessingml::Paragraph]
+        # @return [Integer, nil]
+        def heading_level(paragraph)
+          style_name = resolve_style_name(paragraph)
+          if style_name
+            match = HEADING_PATTERN.match(style_name)
+            return match[2].to_i if match
+          end
+          # Check outline_level on paragraph properties
+          ol = paragraph.properties&.outline_level
+          if ol
+            level = ol.is_a?(Uniword::Wordprocessingml::OutlineLevel) ? ol.value.to_i : ol.to_i
+            return level if level.positive?
+          end
+          nil
+        end
+        # Check if paragraph is a list item
+        # @param paragraph [Uniword::Wordprocessingml::Paragraph]
+        # @return [Boolean]
+        def list_item?(paragraph)
+          return false unless paragraph.properties
+          num_id = paragraph.properties.num_id
+          num_id.to_i.positive?
+        end
+        # Check if paragraph has a specific role based on style name
+        # @param paragraph [Uniword::Wordprocessingml::Paragraph]
+        # @return [Symbol, nil]
+        def role_from_style(paragraph)
+          style_name = resolve_style_name(paragraph)
+          return nil unless style_name
+          case style_name
+          when QUOTE_PATTERN then :quote
+          when CODE_PATTERN then :source
+          when LITERAL_PATTERN then :literal
+          when EXAMPLE_PATTERN then :example
+          end
+        end
+        # Detect semantic role of a run based on its rStyle
+        # @param run [Uniword::Wordprocessingml::Run]
+        # @return [Symbol, nil]
+        def run_semantic_role(run)
+          return nil unless run.properties
+          return nil unless run.properties.style
+          style_name = resolve_run_style_name(run)
+          return nil unless style_name
+          case style_name
+          when /\b(code|verbatim|teletype|keyboard)\b/i then :monospace
+          when /\bstrong\b/i then :bold
+          when /\b(emphasis|em)\b/i then :italic
+          when /\bcitation\b/i then :italic
+          end
+        end
+        private
+        def resolve_style_name(paragraph)
+          style_ref = paragraph.properties&.style
+          return nil unless style_ref
+          value = style_ref.is_a?(Uniword::Wordprocessingml::PStyle) ? style_ref.val : style_ref.to_s
+          return nil unless value
+          mapped = @style_map[value]
+          return mapped if mapped
+          value
+        end
+        def resolve_run_style_name(run)
+          style_ref = run.properties.style
+          return nil unless style_ref
+          value = style_ref.is_a?(Uniword::Wordprocessingml::PStyle) ? style_ref.val : style_ref.to_s
+          return nil unless value
+          mapped = @style_map[value]
+          mapped || value
+        end
+        def find_style_for_paragraph(paragraph)
+          return nil unless @config
+          style_id = style_id_from_paragraph(paragraph)
+          return nil unless style_id
+          return unless @config.is_a?(Uniword::Wordprocessingml::StylesConfiguration)
+          @config.style_by_id(style_id)
+        end
+        def style_id_from_paragraph(paragraph)
+          style_ref = paragraph.properties&.style
+          return nil unless style_ref
+          style_ref.is_a?(Uniword::Wordprocessingml::PStyle) ? style_ref.val : style_ref.to_s
+        end
+        def build_style_map(config)
+          return {} unless config
+          return {} unless config.is_a?(Uniword::Wordprocessingml::StylesConfiguration)
+          map = {}
+          config.styles.each do |style|
+            id = style.styleId
+            name = extract_style_name(style)
+            next unless id && name
+            map[id] = name
+            if heading_by_based_on?(config, style)
+              level = heading_level_from_chain(config, style)
+              map[id] = "Heading#{level}"
+            end
+          end
+          map
+        end
+        def extract_style_name(style)
+          sn = style.style_name
+          return sn if sn
+          name = style.name
+          return nil unless name
+          name.is_a?(Uniword::Wordprocessingml::StyleName) ? name.val.to_s : name.to_s
+        end
+        def heading_by_based_on?(config, style)
+          based_on = style.based_on
+          return false unless based_on
+          visited = Set.new
+          current = style
+          while current && !visited.include?(current.styleId)
+            visited << current.styleId
+            parent_id = current.based_on
+            return true if parent_id && HEADING_PATTERN.match?(parent_id)
+            break unless parent_id
+            current = config.styles.find { |s| s.styleId == parent_id }
+          end
+          false
+        end
+        def heading_level_from_chain(config, style)
+          visited = Set.new
+          current = style
+          while current && !visited.include?(current.styleId)
+            visited << current.styleId
+            name = extract_style_name(current)
+            if name
+              match = HEADING_PATTERN.match(name)
+              return match[2].to_i if match
+            end
+            parent_id = current.based_on
+            break unless parent_id
+            current = config.styles.find { |s| s.styleId == parent_id }
+          end
+          1
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform/to_core_model.rb ADDED Viewed

@@ -0,0 +1,340 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    module Transform
+      # Orchestrator for OOXML → CoreModel transformation.
+      #
+      # Walks a Uniword::Wordprocessingml::DocumentRoot tree and dispatches
+      # to registered transform rules. Handles:
+      #
+      # - Style-based heading detection (via StyleResolver)
+      # - List grouping (consecutive numPr paragraphs → single ListBlock)
+      # - Footnote content collection
+      # - Image reference tracking
+      # - Bookmark ID propagation
+      #
+      # Dispatch strategy:
+      # - HeadingRule and ListItemRule are dispatched directly by the
+      #   orchestrator (they need context for style resolution).
+      # - All other element types are dispatched via RuleRegistry.
+      #
+      # @example Transform a DOCX document
+      #   doc = Uniword::DocumentFactory.from_file("input.docx")
+      #   core = ToCoreModel.transform(doc)
+      #   # => Coradoc::CoreModel::StructuralElement
+      class ToCoreModel
+        class << self
+          def transform(document)
+            new.transform(document)
+          end
+        end
+        def transform(document)
+          registry = build_registry
+          context = Context.new(
+            styles_configuration: document.styles_configuration,
+            numbering_configuration: document.numbering_configuration,
+            footnotes: collect_footnotes(document),
+            registry: registry
+          )
+          @heading_rule = Rules::HeadingRule.new
+          @list_item_rule = Rules::ListItemRule.new
+          body = document.body
+          doc_title = extract_document_title(document, context)
+          children = transform_elements(body, context)
+          # If the first child is an H1 matching the doc title, skip the
+          # duplicate — the document title already captures it
+          if doc_title && children.first.is_a?(Coradoc::CoreModel::StructuralElement) &&
+             children.first.section? &&
+             children.first.title == doc_title &&
+             children.first.level == 1
+            children.shift
+          end
+          doc = Coradoc::CoreModel::StructuralElement.new(
+            element_type: 'document',
+            title: doc_title,
+            children: children
+          )
+          # Extract semantic content from headers/footers
+          extract_header_footer_metadata(document, doc)
+          doc
+        end
+        private
+        # Walk body elements with list grouping look-ahead
+        def transform_elements(body, context)
+          return [] unless body
+          elements = body_ordered_elements(body)
+          result = []
+          i = 0
+          while i < elements.length
+            element = elements[i]
+            transformed = dispatch_element(element, i, elements, context)
+            case transformed
+            when Array
+              consumed = transformed.length
+              result.concat(transformed.compact)
+              i += consumed
+            when nil
+              i += 1
+            else
+              result << transformed
+              i += 1
+            end
+          end
+          result
+        end
+        # Dispatch a single body element, handling paragraphs specially
+        def dispatch_element(element, index, elements, context)
+          # Paragraphs need style-based dispatch (heading, list, or plain)
+          return dispatch_paragraph(element, index, elements, context) if paragraph?(element)
+          # Tables go through registry directly
+          context.transform(element)
+        end
+        def dispatch_paragraph(paragraph, index, elements, context)
+          resolver = context.style_resolver
+          # Check for section break in paragraph properties
+          if section_break?(paragraph)
+            # Section break without heading → thematic break
+            return section_break_element(paragraph, context)
+          end
+          # Heading
+          return @heading_rule.apply(paragraph, context) if resolver.heading?(paragraph)
+          # List item — group consecutive items with same numId
+          return group_list(elements, index, context) if resolver.list_item?(paragraph)
+          # Regular paragraph (via registry)
+          context.transform(paragraph)
+        end
+        def section_break?(paragraph)
+          return false unless paragraph.is_a?(Uniword::Wordprocessingml::Paragraph)
+          return false unless paragraph.properties
+          sect_pr = paragraph.properties.section_properties
+          return false unless sect_pr
+          sect_pr.type ? true : false
+        end
+        def section_break_element(paragraph, context)
+          # First, transform the paragraph content if it has text
+          content = paragraph.runs&.map { |r| r.text&.content.to_s }&.join
+          if content && !content.strip.empty?
+            # Has content — transform normally (content comes before the break)
+            context.transform(paragraph)
+          else
+            # Standalone section break → thematic break
+            Coradoc::CoreModel::Block.new(
+              element_type: 'thematic_break'
+            )
+          end
+        end
+        # Collect consecutive list items with the same numId into a ListBlock
+        def group_list(elements, start_index, context)
+          first = elements[start_index]
+          num_id = first.properties&.num_id.to_i
+          items = []
+          consumed = 0
+          idx = start_index
+          while idx < elements.length
+            para = elements[idx]
+            break unless paragraph?(para)
+            break unless context.style_resolver.list_item?(para)
+            break unless para.properties&.num_id.to_i == num_id
+            items << @list_item_rule.apply(para, context)
+            consumed += 1
+            idx += 1
+          end
+          list_block = Coradoc::CoreModel::ListBlock.new(
+            marker_type: context.numbering_resolver.marker_type(num_id),
+            items: items
+          )
+          # Return array so caller knows how many elements were consumed
+          consumed > 1 ? [list_block] + Array.new(consumed - 1, nil) : list_block
+        end
+        def body_ordered_elements(body)
+          order = body.is_a?(Uniword::Wordprocessingml::Body) ? body.element_order : nil
+          return body.elements if order.nil? || order.empty?
+          p_idx = tbl_idx = sdt_idx = 0
+          order.filter_map do |entry|
+            name = entry.is_a?(String) ? entry : entry.name
+            case name
+            when 'p'
+              para = body.paragraphs[p_idx]
+              p_idx += 1
+              para
+            when 'tbl'
+              tbl = body.tables[tbl_idx]
+              tbl_idx += 1
+              tbl
+            when 'sdt'
+              sdt = body.structured_document_tags&.[](sdt_idx)
+              sdt_idx += 1
+              sdt
+            end
+          end
+        end
+        def paragraph?(element)
+          defined?(Uniword::Wordprocessingml::Paragraph) &&
+            element.is_a?(Uniword::Wordprocessingml::Paragraph)
+        end
+        def collect_footnotes(document)
+          footnotes = {}
+          doc_footnotes = document.footnotes
+          if doc_footnotes.is_a?(Hash)
+            doc_footnotes.each do |id, fn|
+              paragraphs = fn.is_a?(Uniword::Wordprocessingml::Footnote) ? fn.paragraphs : fn[:content]
+              footnotes[id.to_s] = Array(paragraphs)
+            end
+          end
+          if defined?(Uniword::Wordprocessingml::Footnotes) &&
+             document.footnotes.is_a?(Uniword::Wordprocessingml::Footnotes)
+            document.footnotes.footnotes.each do |fn|
+              id = fn.id&.to_s
+              next unless id
+              footnotes[id] = Array(fn.paragraphs || [])
+            end
+          end
+          footnotes
+        end
+        def extract_document_title(document, context)
+          body = document.body
+          return nil unless body
+          paragraphs = body.paragraphs || []
+          paragraphs.each do |para|
+            next unless context.style_resolver.heading?(para)
+            next unless context.style_resolver.heading_level(para) == 1
+            runs = para.runs || []
+            return runs.map { |r| r.text&.content.to_s }.join
+          end
+          nil
+        end
+        def build_registry
+          registry = RuleRegistry.new
+          # Only register rules that don't need context for dispatch
+          registry.register(Rules::ParagraphRule.new)
+          registry.register(Rules::RunRule.new)
+          registry.register(Rules::TextRule.new)
+          registry.register(Rules::BreakRule.new)
+          registry.register(Rules::HyperlinkRule.new)
+          registry.register(Rules::ImageRule.new)
+          registry.register(Rules::FootnoteRule.new)
+          registry.register(Rules::BookmarkRule.new)
+          registry.register(Rules::TableRule.new)
+          registry.register(Rules::MathRule.new)
+          registry.register(Rules::StructuredDocumentTagRule.new)
+          registry.register(Rules::SimpleFieldRule.new)
+          registry.register(Rules::ProofErrorRule.new)
+          registry
+        end
+        # Extract semantic text from headers and footers.
+        # Discards purely layout text ("Page X of Y", page numbers, dates).
+        # Preserves meaningful text (title, version, confidentiality notices).
+        def extract_header_footer_metadata(document, core_doc)
+          extract_from_parts(document, :headers, 'header', core_doc)
+          extract_from_parts(document, :footers, 'footer', core_doc)
+        end
+        def extract_from_parts(document, method, prefix, core_doc)
+          parts = case method
+                  when :headers then document.headers
+                  when :footers then document.footers
+                  end
+          return unless parts
+          Array(parts).each_with_index do |part, idx|
+            text = extract_part_text(part)
+            next if text.nil? || text.strip.empty?
+            next if layout_only_text?(text)
+            part_type = if part.is_a?(Uniword::Wordprocessingml::Header) ||
+                           part.is_a?(Uniword::Wordprocessingml::Footer)
+                          part.type
+                        else
+                          idx
+                        end
+            core_doc.set_metadata("docx.#{prefix}.#{part_type}", text.strip)
+          end
+        end
+        def extract_part_text(part)
+          paragraphs = part.paragraphs || []
+          return nil unless paragraphs
+          paragraphs.map do |para|
+            extract_paragraph_text_content(para)
+          end.compact.join(' ').strip
+        end
+        def extract_paragraph_text_content(para)
+          runs = para.runs || []
+          return nil unless runs
+          runs.map { |r| r.text&.content.to_s }.join
+        end
+        # Check if header/footer text is purely layout content
+        # (page numbers, "Page X of Y", dates, etc.)
+        def layout_only_text?(text)
+          stripped = text.strip
+          return true if stripped.empty?
+          # Pure numbers (page numbers)
+          return true if stripped.match?(/\A\d+\z/)
+          # Common page number patterns
+          return true if stripped.match?(/\APage\s+\d+(\s+of\s+\d+)?\z/i)
+          # Pure date patterns
+          return true if stripped.match?(%r{\A\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\z})
+          return true if stripped.match?(%r{\A\d{4}[/-]\d{1,2}[/-]\d{1,2}\z})
+          false
+        end
+      end
+    end
+  end
+end

data/lib/coradoc/docx/transform.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    # Namespace for the DOCX <-> CoreModel transformation layer.
+    #
+    # Every constant below is registered with `autoload`, so its file is only
+    # required the first time the constant is referenced.
+    module Transform
+      # Infrastructure: rule base class, registry, context, the two
+      # orchestrators and the resolver/helper classes.
+      autoload :Rule, 'coradoc/docx/transform/rule'
+      autoload :RuleRegistry, 'coradoc/docx/transform/rule_registry'
+      autoload :Context, 'coradoc/docx/transform/context'
+      autoload :ToCoreModel, 'coradoc/docx/transform/to_core_model'
+      autoload :FromCoreModel, 'coradoc/docx/transform/from_core_model'
+      autoload :StyleResolver, 'coradoc/docx/transform/style_resolver'
+      autoload :NumberingResolver, 'coradoc/docx/transform/numbering_resolver'
+      autoload :OrderedContent, 'coradoc/docx/transform/ordered_content'
+      # Element transform rules (one autoloaded class per rule file under
+      # coradoc/docx/transform/rules/).
+      module Rules
+        autoload :TextRule, 'coradoc/docx/transform/rules/text_rule'
+        autoload :BreakRule, 'coradoc/docx/transform/rules/break_rule'
+        autoload :RunRule, 'coradoc/docx/transform/rules/run_rule'
+        autoload :HyperlinkRule, 'coradoc/docx/transform/rules/hyperlink_rule'
+        autoload :ImageRule, 'coradoc/docx/transform/rules/image_rule'
+        autoload :FootnoteRule, 'coradoc/docx/transform/rules/footnote_rule'
+        autoload :HeadingRule, 'coradoc/docx/transform/rules/heading_rule'
+        autoload :ListItemRule, 'coradoc/docx/transform/rules/list_item_rule'
+        autoload :ParagraphRule, 'coradoc/docx/transform/rules/paragraph_rule'
+        autoload :TableRule, 'coradoc/docx/transform/rules/table_rule'
+        autoload :MathRule, 'coradoc/docx/transform/rules/math_rule'
+        autoload :BookmarkRule, 'coradoc/docx/transform/rules/bookmark_rule'
+        autoload :StructuredDocumentTagRule,
+                 'coradoc/docx/transform/rules/structured_document_tag_rule'
+        autoload :SimpleFieldRule,
+                 'coradoc/docx/transform/rules/simple_field_rule'
+        autoload :ProofErrorRule,
+                 'coradoc/docx/transform/rules/proof_error_rule'
+      end
+    end
+  end
+end

data/lib/coradoc/docx/version.rb ADDED Viewed

@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+module Coradoc
+  module Docx
+    # Version string of the coradoc-docx gem.
+    VERSION = '0.1.0'
+  end
+end