coradoc-docx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.adoc +164 -0
- data/lib/coradoc/docx/transform/context.rb +72 -0
- data/lib/coradoc/docx/transform/from_core_model.rb +577 -0
- data/lib/coradoc/docx/transform/numbering_resolver.rb +127 -0
- data/lib/coradoc/docx/transform/ordered_content.rb +95 -0
- data/lib/coradoc/docx/transform/rule.rb +57 -0
- data/lib/coradoc/docx/transform/rule_registry.rb +60 -0
- data/lib/coradoc/docx/transform/rules/bookmark_rule.rb +34 -0
- data/lib/coradoc/docx/transform/rules/break_rule.rb +30 -0
- data/lib/coradoc/docx/transform/rules/footnote_rule.rb +27 -0
- data/lib/coradoc/docx/transform/rules/heading_rule.rb +53 -0
- data/lib/coradoc/docx/transform/rules/hyperlink_rule.rb +58 -0
- data/lib/coradoc/docx/transform/rules/image_rule.rb +125 -0
- data/lib/coradoc/docx/transform/rules/list_item_rule.rb +47 -0
- data/lib/coradoc/docx/transform/rules/math_rule.rb +82 -0
- data/lib/coradoc/docx/transform/rules/paragraph_rule.rb +65 -0
- data/lib/coradoc/docx/transform/rules/proof_error_rule.rb +25 -0
- data/lib/coradoc/docx/transform/rules/run_rule.rb +189 -0
- data/lib/coradoc/docx/transform/rules/simple_field_rule.rb +87 -0
- data/lib/coradoc/docx/transform/rules/structured_document_tag_rule.rb +36 -0
- data/lib/coradoc/docx/transform/rules/table_rule.rb +85 -0
- data/lib/coradoc/docx/transform/rules/text_rule.rb +25 -0
- data/lib/coradoc/docx/transform/style_resolver.rb +249 -0
- data/lib/coradoc/docx/transform/to_core_model.rb +340 -0
- data/lib/coradoc/docx/transform.rb +38 -0
- data/lib/coradoc/docx/version.rb +7 -0
- data/lib/coradoc/docx.rb +99 -0
- metadata +155 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 3c3d4bb0be28a10758044449cbd23105ebf3b954813322561ae3f338af085e4a
|
|
4
|
+
data.tar.gz: 8fed48bf56249e71de9e8117f4345f25f6828558c3a1067ee37a786fa52e8506
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: b9ea07f47f7468155c3c3aa4e502284193c54609006ffb2768659fca3f0e77e8abd4edb7274bf07a6cf50f06c5922bd119d966f9ba5dfa1de419c65833619537
|
|
7
|
+
data.tar.gz: f323b453647188ac9ffb716c82263e9bb13bce2ca24efdb6eead3562f01925f07769d39d6f369de37d14e1ef464ea47d07369bbbcf716fb467f5d22ab5c8eceb
|
data/README.adoc
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
= coradoc-docx
|
|
2
|
+
image:https://img.shields.io/gem/v/coradoc-docx.svg[RubyGems Version]
|
|
3
|
+
|
|
4
|
+
DOCX (OOXML) format support for the
|
|
5
|
+
https://github.com/lutaml/coradoc[Coradoc] document transformation library.
|
|
6
|
+
|
|
7
|
+
== Purpose
|
|
8
|
+
|
|
9
|
+
coradoc-docx reads Microsoft Word `.docx` files via
|
|
10
|
+
https://github.com/lutaml/uniword[Uniword] and transforms the OOXML model
|
|
11
|
+
tree into Coradoc's canonical CoreModel. Once in CoreModel, the document can
|
|
12
|
+
be serialized to AsciiDoc, Markdown, or any other supported output format.
|
|
13
|
+
|
|
14
|
+
== Installation
|
|
15
|
+
|
|
16
|
+
Add to your Gemfile:
|
|
17
|
+
|
|
18
|
+
[source,ruby]
|
|
19
|
+
----
|
|
20
|
+
gem 'coradoc-docx'
|
|
21
|
+
----
|
|
22
|
+
|
|
23
|
+
Or install directly:
|
|
24
|
+
|
|
25
|
+
[source,bash]
|
|
26
|
+
----
|
|
27
|
+
gem install coradoc-docx
|
|
28
|
+
----
|
|
29
|
+
|
|
30
|
+
The gem depends on `coradoc` and `uniword`, which will be installed
|
|
31
|
+
automatically.
|
|
32
|
+
|
|
33
|
+
== Usage
|
|
34
|
+
|
|
35
|
+
=== Convert DOCX to AsciiDoc
|
|
36
|
+
|
|
37
|
+
[source,ruby]
|
|
38
|
+
----
|
|
39
|
+
require 'coradoc'
|
|
40
|
+
require 'coradoc/docx'
|
|
41
|
+
|
|
42
|
+
adoc = Coradoc.convert("input.docx", from: :docx, to: :asciidoc)
|
|
43
|
+
----
|
|
44
|
+
|
|
45
|
+
=== Convert DOCX to Markdown
|
|
46
|
+
|
|
47
|
+
[source,ruby]
|
|
48
|
+
----
|
|
49
|
+
md = Coradoc.convert("input.docx", from: :docx, to: :markdown)
|
|
50
|
+
----
|
|
51
|
+
|
|
52
|
+
=== Parse DOCX to CoreModel
|
|
53
|
+
|
|
54
|
+
[source,ruby]
|
|
55
|
+
----
|
|
56
|
+
core = Coradoc.parse("input.docx", format: :docx)
|
|
57
|
+
|
|
58
|
+
core.title # => "Document Title"
|
|
59
|
+
core.children # => Array of sections, paragraphs, tables, etc.
|
|
60
|
+
|
|
61
|
+
# Serialize to any format
|
|
62
|
+
adoc = Coradoc.serialize(core, to: :asciidoc)
|
|
63
|
+
html = Coradoc.serialize(core, to: :html)
|
|
64
|
+
----
|
|
65
|
+
|
|
66
|
+
=== CLI
|
|
67
|
+
|
|
68
|
+
[source,bash]
|
|
69
|
+
----
|
|
70
|
+
# Convert DOCX to AsciiDoc
|
|
71
|
+
coradoc convert document.docx -o output.adoc
|
|
72
|
+
|
|
73
|
+
# Convert DOCX to Markdown
|
|
74
|
+
coradoc convert document.docx -o output.md
|
|
75
|
+
----
|
|
76
|
+
|
|
77
|
+
== How It Works
|
|
78
|
+
|
|
79
|
+
The DOCX pipeline uses Uniword to parse the OOXML zip archive into a typed
|
|
80
|
+
model tree, then transforms it to CoreModel:
|
|
81
|
+
|
|
82
|
+
....
|
|
83
|
+
DOCX file
|
|
84
|
+
→ Uniword::DocumentFactory.from_file
|
|
85
|
+
→ OOXML model tree (Uniword::Wordprocessingml::*)
|
|
86
|
+
→ Coradoc::Docx::Transform::ToCoreModel (rule-based dispatch)
|
|
87
|
+
→ CoreModel tree (canonical hub)
|
|
88
|
+
→ FromCoreModel (AsciiDoc or Markdown)
|
|
89
|
+
→ Format model tree → Serializer → .adoc or .md file
|
|
90
|
+
....
|
|
91
|
+
|
|
92
|
+
The transform uses a **rule registry** with priority-based dispatch. Each
|
|
93
|
+
OOXML element type has a dedicated rule class that produces a typed CoreModel
|
|
94
|
+
node. Style-based semantic detection (headings, lists, quotes) is handled by
|
|
95
|
+
`StyleResolver` and `NumberingResolver`.
|
|
96
|
+
|
|
97
|
+
=== Supported OOXML Elements
|
|
98
|
+
|
|
99
|
+
[cols="1,1,1",options="header"]
|===
| OOXML Element | Style/Condition | CoreModel Target

| `w:p` (Heading style) | pStyle=HeadingN | StructuralElement (section)
| `w:p` (numPr) | numbering reference | ListBlock + ListItem
| `w:p` (Quote style) | style detection | Block (quote)
| `w:p` (Code style) | style detection | Block (source/listing)
| `w:p` (default) | - | Block (paragraph)
| `w:r` (bold) | rPr/bold | InlineElement (bold)
| `w:r` (italic) | rPr/italic | InlineElement (italic)
| `w:r` (underline) | rPr/underline | InlineElement (underline)
| `w:r` (strike) | rPr/strike | InlineElement (strikethrough)
| `w:r` (sub/sup) | rPr/vertAlign | InlineElement (subscript/superscript)
| `w:hyperlink` | r:id or w:anchor | InlineElement (link)
| `w:tbl` | - | Table
| `w:drawing` / `w:pict` | inline or anchor | Image
| `m:oMathPara` / `m:oMath` | - | Block or InlineElement (stem)
| `w:footnoteReference` | - | FootnoteReference
|===
|
|
116
|
+
|
|
117
|
+
== Limitations
|
|
118
|
+
|
|
119
|
+
* **Parse only** -- DOCX to CoreModel is supported; CoreModel to DOCX is not
|
|
120
|
+
yet implemented.
|
|
121
|
+
* **Print layout** -- Page size, margins, headers/footers as page regions are
|
|
122
|
+
discarded (CoreModel is semantic, not print layout).
|
|
123
|
+
* **Complex fields** -- Field characters (TOC, PAGE) are partially handled.
|
|
124
|
+
* **Tracked changes** -- Deleted text is currently skipped.
|
|
125
|
+
* **VML shapes** -- Only inline drawings are extracted.
|
|
126
|
+
|
|
127
|
+
== Architecture
|
|
128
|
+
|
|
129
|
+
=== Rule Classes
|
|
130
|
+
|
|
131
|
+
Each OOXML element type is handled by a dedicated rule class in
|
|
132
|
+
`Coradoc::Docx::Transform::Rules`:
|
|
133
|
+
|
|
134
|
+
- `HeadingRule` -- Detects heading paragraphs via style or outline level
|
|
135
|
+
- `ListItemRule` -- Detects numbered/bulleted paragraphs via numbering resolver
|
|
136
|
+
- `ParagraphRule` -- Default paragraph transform
|
|
137
|
+
- `RunRule` -- Inline formatting (bold, italic, monospace, links, etc.)
|
|
138
|
+
- `TableRule` -- Table structure with rowspan/colspan
|
|
139
|
+
- `HyperlinkRule` -- External links and bookmarks
|
|
140
|
+
- `ImageRule` -- Inline and anchored drawings
|
|
141
|
+
- `FootnoteRule` -- Footnote references
|
|
142
|
+
- `MathRule` -- OMML math via Plurimath/LaTeX
|
|
143
|
+
|
|
144
|
+
=== Resolvers
|
|
145
|
+
|
|
146
|
+
- `StyleResolver` -- Walks the OOXML style definitions to detect semantic
|
|
147
|
+
roles (heading levels, code style, quote style) including `basedOn` chains
|
|
148
|
+
- `NumberingResolver` -- Resolves numbering definitions to detect ordered vs.
|
|
149
|
+
unordered lists and their marker types
|
|
150
|
+
|
|
151
|
+
== Development
|
|
152
|
+
|
|
153
|
+
Run tests:
|
|
154
|
+
|
|
155
|
+
[source,bash]
|
|
156
|
+
----
|
|
157
|
+
bundle exec rake spec:coradoc_docx
|
|
158
|
+
----
|
|
159
|
+
|
|
160
|
+
== License
|
|
161
|
+
|
|
162
|
+
Copyright:: 2024-2026 Ribose Inc.
|
|
163
|
+
|
|
164
|
+
Licensed under the https://www.apache.org/licenses/LICENSE-2.0.html[Apache License, Version 2.0].
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coradoc
|
|
4
|
+
module Docx
|
|
5
|
+
module Transform
|
|
6
|
+
# Shared context for a single OOXML → CoreModel transform pass.
|
|
7
|
+
#
|
|
8
|
+
# Holds resolvers, footnote content, image references, and the
|
|
9
|
+
# rule registry. Passed to every rule's #apply method so rules
|
|
10
|
+
# can delegate sub-transforms (e.g., transform runs inside a
|
|
11
|
+
# paragraph).
|
|
12
|
+
class Context
  # Collaborators and state collected during one transform pass.
  # image_refs starts empty and grows through #register_image.
  attr_reader :style_resolver, :numbering_resolver,
              :footnotes, :image_refs, :registry

  # Build the context for a single transform pass.
  #
  # @param styles_configuration [Object, nil] Uniword styles config,
  #   handed to StyleResolver
  # @param numbering_configuration [Object, nil] Uniword numbering config,
  #   handed to NumberingResolver
  # @param footnotes [Hash{String => Array}, nil] footnote id → content
  #   paragraphs; nil is treated as "no footnotes"
  # @param registry [RuleRegistry, nil] rule dispatch registry
  def initialize(styles_configuration: nil, numbering_configuration: nil,
                 footnotes: {}, registry: nil)
    @style_resolver = StyleResolver.new(styles_configuration)
    @numbering_resolver = NumberingResolver.new(numbering_configuration)
    # Normalise an explicit nil so #footnote_content can always index a Hash.
    @footnotes = footnotes || {}
    @image_refs = []
    @registry = registry
  end

  # Transform an element by dispatching to the rule the registry selects.
  #
  # Returns nil instead of raising NoMethodError when the element is nil,
  # when the context was built without a registry, or when the registry
  # yields no rule for the element.
  #
  # @param element [Object] OOXML element
  # @return [Coradoc::CoreModel::Base, Array, String, nil]
  def transform(element)
    return nil if element.nil?

    rule = @registry&.find_rule(element)
    rule&.apply(element, self)
  end

  # Fetch footnote content by ID.
  #
  # The ID is stringified before the lookup, so Integer and String IDs
  # address the same entry.
  #
  # @param id [String, Integer] footnote ID
  # @return [String, nil] text of the footnote's paragraphs joined with
  #   newlines, or nil when the ID is nil/false or unknown
  def footnote_content(id)
    return nil unless id

    paragraphs = @footnotes[id.to_s]
    return nil unless paragraphs

    paragraphs.map { |para| extract_paragraph_text(para) }.join("\n")
  end

  # Record an image reference for later extraction.
  #
  # @param ref [Hash] image reference with :src, :alt, etc.
  # @return [Array] the accumulated image references
  def register_image(ref)
    @image_refs << ref
  end

  private

  # Plain text of a footnote entry.
  #
  # Only Uniword paragraphs and runs are read; anything else yields an
  # empty string. A nil #text also becomes an empty string.
  #
  # @param paragraph [Object] candidate footnote content node
  # @return [String]
  def extract_paragraph_text(paragraph)
    case paragraph
    when Uniword::Wordprocessingml::Paragraph, Uniword::Wordprocessingml::Run
      paragraph.text.to_s
    else
      ''
    end
  end
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|