coradoc-markdown 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/lib/coradoc/markdown/errors.rb +28 -0
  4. data/lib/coradoc/markdown/model/abbreviation.rb +27 -0
  5. data/lib/coradoc/markdown/model/attribute_list.rb +98 -0
  6. data/lib/coradoc/markdown/model/base.rb +86 -0
  7. data/lib/coradoc/markdown/model/blockquote.rb +21 -0
  8. data/lib/coradoc/markdown/model/code.rb +11 -0
  9. data/lib/coradoc/markdown/model/code_block.rb +24 -0
  10. data/lib/coradoc/markdown/model/definition_item.rb +24 -0
  11. data/lib/coradoc/markdown/model/definition_list.rb +47 -0
  12. data/lib/coradoc/markdown/model/definition_term.rb +21 -0
  13. data/lib/coradoc/markdown/model/document.rb +39 -0
  14. data/lib/coradoc/markdown/model/emphasis.rb +11 -0
  15. data/lib/coradoc/markdown/model/extension.rb +92 -0
  16. data/lib/coradoc/markdown/model/footnote.rb +31 -0
  17. data/lib/coradoc/markdown/model/footnote_reference.rb +22 -0
  18. data/lib/coradoc/markdown/model/heading.rb +44 -0
  19. data/lib/coradoc/markdown/model/highlight.rb +18 -0
  20. data/lib/coradoc/markdown/model/horizontal_rule.rb +16 -0
  21. data/lib/coradoc/markdown/model/image.rb +19 -0
  22. data/lib/coradoc/markdown/model/link.rb +19 -0
  23. data/lib/coradoc/markdown/model/list.rb +22 -0
  24. data/lib/coradoc/markdown/model/list_item.rb +29 -0
  25. data/lib/coradoc/markdown/model/math.rb +50 -0
  26. data/lib/coradoc/markdown/model/paragraph.rb +28 -0
  27. data/lib/coradoc/markdown/model/strikethrough.rb +18 -0
  28. data/lib/coradoc/markdown/model/strong.rb +11 -0
  29. data/lib/coradoc/markdown/model/table.rb +13 -0
  30. data/lib/coradoc/markdown/model/text.rb +15 -0
  31. data/lib/coradoc/markdown/parser/ast_processor.rb +543 -0
  32. data/lib/coradoc/markdown/parser/block_parser.rb +745 -0
  33. data/lib/coradoc/markdown/parser/html_entities.rb +2149 -0
  34. data/lib/coradoc/markdown/parser/inline_parser.rb +274 -0
  35. data/lib/coradoc/markdown/parser/parslet_extras.rb +215 -0
  36. data/lib/coradoc/markdown/parser.rb +11 -0
  37. data/lib/coradoc/markdown/parser_util.rb +90 -0
  38. data/lib/coradoc/markdown/serializer.rb +199 -0
  39. data/lib/coradoc/markdown/toc_generator.rb +215 -0
  40. data/lib/coradoc/markdown/transform/from_core_model.rb +325 -0
  41. data/lib/coradoc/markdown/transform/text_extraction.rb +19 -0
  42. data/lib/coradoc/markdown/transform/to_core_model.rb +287 -0
  43. data/lib/coradoc/markdown/transformer.rb +463 -0
  44. data/lib/coradoc/markdown/version.rb +7 -0
  45. data/lib/coradoc/markdown.rb +190 -0
  46. metadata +173 -0
@@ -0,0 +1,543 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Coradoc
4
+ module Markdown
5
+ module Parser
6
+ autoload :InlineParser, "#{__dir__}/inline_parser"
7
+
8
+ # Post-processes the AST produced by BlockParser.
9
+ #
10
+ # This processor handles:
11
+ # - Escape sequence processing (\# -> #, \* -> *, etc.)
12
+ # - Hard line break detection (two+ spaces at end of line)
13
+ # - Inline element parsing (emphasis, code spans, etc.)
14
+ #
15
+ class AstProcessor
16
# The 32 ASCII punctuation characters that Markdown allows to be
# backslash-escaped, one single-character string per entry.
ESCAPABLE_CHARS = [
  '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
  ':', ';', '<', '=', '>', '?', '@',
  '[', '\\', ']', '^', '_', '`',
  '{', '|', '}', '~'
].freeze
20
+
21
+ class << self
22
# Run the post-processing pipeline over an AST produced by BlockParser.
#
# The structural pass (process_node) always runs; the inline pass
# (process_inlines) runs on its output only when +parse_inlines+ is truthy.
#
# @param ast [Array, nil] The parsed AST from BlockParser
# @param parse_inlines [Boolean] Whether to parse inline elements
# @return [Array, nil] The processed AST; nil is passed through untouched
def process(ast, parse_inlines: true)
  return ast if ast.nil?

  normalized = process_node(ast)
  parse_inlines ? process_inlines(normalized) : normalized
end
34
+
35
# Split text around Kramdown footnote references of the form [^name].
#
# Plain stretches are kept as strings and each reference becomes
# { fn_ref: name }; empty stretches between references are dropped.
#
# @param text [String, nil] the text to scan
# @return [Array, String, Hash] [text] when text is nil or empty; the lone
#   element itself when splitting yields exactly one piece; otherwise the
#   array of pieces in document order
def extract_inline_elements(text)
  return [text] if text.nil? || text.empty?

  footnote_ref = /\[\^([^\]]+)\]/
  pieces = []
  rest = text

  # Consume one reference per iteration, keeping whatever precedes it.
  while (hit = rest.match(footnote_ref))
    pieces << hit.pre_match unless hit.pre_match.empty?
    pieces << { fn_ref: hit[1] }
    rest = hit.post_match
  end

  # Whatever follows the last reference is plain text.
  pieces << rest unless rest.empty?

  pieces.length == 1 ? pieces.first : pieces
end
63
+
64
# Apply Kramdown-style typographic substitutions.
#
# Substitutions run in table order, so "---" (em-dash) is consumed before
# "--" (en-dash) gets a chance to match part of it; "..." becomes an
# ellipsis last.
#
# @param text [#to_s, nil] the text to convert
# @return [String, nil] the converted string, or nil when text is nil
def apply_typography(text)
  return text if text.nil?

  substitutions = [['---', '—'], ['--', '–'], ['...', '…']]
  substitutions.reduce(text.to_s) do |acc, (plain, glyph)|
    acc.gsub(plain, glyph)
  end
end
77
+
78
+ private
79
+
80
# Dispatch one AST node to the matching structural processor.
#
# Arrays are processed element-wise, hashes go through process_hash, and
# Parslet::Slice leaves are turned into strings with escapes resolved.
# Every other value (including plain Strings and nil) is returned as is.
#
# @param node [Object] any AST node
# @return [Object] the processed node
def process_node(node)
  return node.map { |child| process_node(child) } if node.is_a?(Array)
  return process_hash(node) if node.is_a?(Hash)
  return process_escapes(node.to_s) if node.is_a?(Parslet::Slice)

  node
end
94
+
95
# Build a new hash with every value processed according to its key.
#
# Keys with dedicated handling:
# - :ln   -> process_line_content
# - :p    -> process_paragraph_content
# - :cell -> process_table_cell
# - :sep  -> process_table_separator
# Every other key (including :text and the table container keys) is
# processed generically via process_node.
#
# @param hash [Hash] the hash node to process
# @return [Hash] a new hash with the same keys in the same order
def process_hash(hash)
  hash.each_with_object({}) do |(key, value), processed|
    processed[key] =
      case key
      when :ln then process_line_content(value)
      when :p then process_paragraph_content(value)
      when :cell then process_table_cell(value)
      when :sep then process_table_separator(value)
      else process_node(value)
      end
  end
end
126
+
127
# Normalise the content of a table cell.
#
# The value is stringified, stripped of leading and trailing whitespace,
# and then has its backslash escapes resolved.
#
# @param value [#to_s, nil] raw cell content
# @return [String, nil] the cleaned cell text, or nil when value is nil
def process_table_cell(value)
  value.nil? ? nil : process_escapes(value.to_s.strip)
end
134
+
135
# Reduce a table separator cell to its alignment indicator.
#
#   "----------" -> "-"    (no alignment)
#   ":-----"     -> ":-"   (left align)
#   "------:"    -> "-:"   (right align)
#   ":----:"     -> ":-:"  (center align)
#
# Only the first and last characters are inspected, so a lone ":" counts
# as both a leading and a trailing colon.
#
# @param value [#to_s, nil] raw separator text
# @return [String, nil] the alignment indicator, or nil when value is nil
def process_table_separator(value)
  return value if value.nil?

  marker = value.to_s
  left = marker.start_with?(':') ? ':' : ''
  right = marker.end_with?(':') ? ':' : ''

  "#{left}-#{right}"
end
157
+
158
# Process the content of a single line.
#
# The value is stringified and its backslash escapes are resolved.
# Trailing whitespace is left untouched: hard line breaks are detected by
# the paragraph-level callers, not here.
#
# @param value [#to_s, nil] line content (typically a Parslet::Slice or String)
# @return [String, nil] the line with escapes resolved, or nil when value is nil
def process_line_content(value)
  return value if value.nil?

  process_escapes(value.to_s)
end
170
+
171
# Process the content of a paragraph node.
#
# - Array: treated as a list of lines and delegated to
#   process_paragraph_lines.
# - Hash with an :ln key: a single line. When the line ends with two or
#   more spaces (a hard line break) it is split into the right-stripped
#   line followed by a :br marker holding exactly those trailing spaces.
# - Anything else: processed generically via process_node.
#
# @param value [Array, Hash, Object, nil] paragraph content
# @return [Object, nil] the processed content; an Array of
#   [{ ln: String }, { br: String }] when a single-line hard break is found;
#   nil when value is nil
def process_paragraph_content(value)
  return value if value.nil?

  case value
  when Array
    process_paragraph_lines(value)
  when Hash
    if value.key?(:ln)
      text = process_line_content(value[:ln])
      # process_line_content returns nil for a nil line; nothing to inspect then.
      unless text.nil?
        # Measure the run of trailing spaces on the untouched text. The
        # threshold of two matches process_paragraph_lines.
        trailing = text[/ *\z/].length
        return [{ ln: text.rstrip }, { br: ' ' * trailing }] if trailing >= 2
      end
    end
    process_node(value)
  else
    process_node(value)
  end
end
197
+
198
# Process the lines of a multi-line paragraph and mark hard line breaks.
#
# Each line is first run through process_node. A processed { ln: ... }
# hash whose text carries two or more trailing whitespace characters is
# replaced by the right-stripped line plus a { br: ... } marker as wide as
# the removed whitespace. All other nodes are kept unchanged.
#
# A :br hash left in final position is dropped, because a hard break at
# the end of a paragraph is ignored.
#
# @param lines [Array] the raw paragraph lines
# @return [Array] processed lines interleaved with :br markers
def process_paragraph_lines(lines)
  output = lines.each_with_object([]) do |line, acc|
    node = process_node(line)
    raw = node.is_a?(Hash) && node.key?(:ln) ? node[:ln].to_s : nil
    bare = raw&.rstrip
    padding = raw ? raw.length - bare.length : 0

    if padding >= 2
      acc << { ln: bare } << { br: ' ' * padding }
    else
      acc << node
    end
  end

  tail = output.last
  output.pop if tail.is_a?(Hash) && tail.key?(:br)
  output
end
231
+
232
# Process a text value (Parslet::Slice or String).
#
# Stringifies the value and resolves backslash escapes; the structure of
# the surrounding AST is not changed.
#
# @param value [#to_s, nil] the text value
# @return [String, nil] the text with escapes resolved, or nil when value is nil
def process_text_value(value)
  return value if value.nil?

  process_escapes(value.to_s)
end
242
+
243
# Resolve Markdown backslash escapes in text.
#
# A backslash followed by an ASCII punctuation character is replaced by
# that character alone (\# -> #, \* -> *, \\ -> \). A backslash in front
# of any other character is left in place.
#
# @param text [String, nil] the text to unescape
# @return [String, nil] the unescaped copy, or nil when text is nil
def process_escapes(text)
  # '\1' re-inserts the captured punctuation character without its backslash.
  text&.gsub(%r{\\([!-"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])}, '\1')
end
254
+
255
# Run inline parsing over an already-processed AST.
#
# One InlineParser instance is created per call and shared across the
# whole recursive walk.
#
# @param ast [Object, nil] the processed AST
# @return [Object, nil] the AST with inline elements parsed; nil when ast is nil
def process_inlines(ast)
  ast.nil? ? ast : process_inlines_recursive(ast, InlineParser.new)
end
264
+
265
# Walk the AST and apply inline parsing wherever it is relevant.
#
# Hashes are handed to process_inlines_hash. Arrays are processed item by
# item; when a { ln: ... } item expands into an Array, its elements are
# spliced into the surrounding array instead of being nested. Every other
# node is returned unchanged.
#
# @param node [Object] the node to process
# @param inline_parser [Object] the parser forwarded to process_inlines_hash
# @return [Object] the processed node
def process_inlines_recursive(node, inline_parser)
  case node
  when Hash
    process_inlines_hash(node, inline_parser)
  when Array
    node.flat_map do |item|
      expanded = process_inlines_recursive(item, inline_parser)
      splice = expanded.is_a?(Array) && item.is_a?(Hash) && item.key?(:ln)
      # Wrap non-spliced results so flat_map keeps them as a single element.
      splice ? expanded : [expanded]
    end
  else
    node
  end
end
287
+
288
# Apply inline parsing to the values of a hash node.
#
# - A hash with a :code_block key short-circuits: only that key is kept,
#   processed by process_node (code block content is literal).
# - :heading values, and String :text values inside a heading hash, are
#   passed through process_node.
# - :ln values go through parse_inline_content. Depending on the result:
#     * Hash with :ln  -> its :ln value is stored under :ln
#     * Hash with :em  -> that hash is returned immediately as the result
#     * any other Hash -> merged into the result
#     * Array          -> returned immediately, unless it holds exactly one
#                         Hash, which is merged into the result
#     * anything else  -> stored under :ln
# - Every other value is processed via process_inlines_recursive.
#
# @param hash [Hash] the node to process
# @param inline_parser [Object] parser forwarded to the inline helpers
# @return [Hash, Array] the processed hash, or the early-returned inline result
def process_inlines_hash(hash, inline_parser)
  return { code_block: process_node(hash[:code_block]) } if hash.key?(:code_block)

  in_heading = hash.key?(:heading)

  hash.each_with_object({}) do |(key, value), out|
    if key == :heading || (key == :text && in_heading && value.is_a?(String))
      out[key] = process_node(value)
    elsif key == :ln
      parsed = parse_inline_content(value, inline_parser)

      case parsed
      when Hash
        if parsed.key?(:ln)
          out[key] = parsed[:ln]
        elsif parsed.key?(:em)
          # Leaves the method: the emphasis hash replaces the whole node.
          return parsed
        else
          out.merge!(parsed)
        end
      when Array
        # Leaves the method: multiple inline elements replace the whole node.
        return parsed unless parsed.length == 1 && parsed.first.is_a?(Hash)

        out.merge!(parsed.first)
      else
        out[key] = parsed
      end
    else
      out[key] = process_inlines_recursive(value, inline_parser)
    end
  end
end
346
+
347
# Parse the inline content of one line and wrap the outcome in { ln: ... }.
#
# The text first goes through process_html_markdown_attr and is then
# handed to the inline parser. When the converted result contains
# emphasis, it is returned as structured data; otherwise its text content
# is flattened back into a single string.
#
# The original +text+ is returned (still wrapped in { ln: ... }) when it is
# nil or empty, when the parser yields nil or an empty result, or when the
# parser raises Parslet::ParseFailed.
#
# @param text [#to_s, nil] raw line content
# @param inline_parser [#parse] the inline parser
# @return [Hash] a hash with a single :ln key
def parse_inline_content(text, inline_parser)
  untouched = { ln: text }
  return untouched if text.nil? || text.to_s.empty?

  # Kramdown extension: expand <tag markdown="..."> wrappers before parsing.
  source = process_html_markdown_attr(text.to_s, inline_parser)

  begin
    tree = inline_parser.parse(source)
    return untouched if tree.nil? || tree.empty?

    converted = convert_inline_result(tree)
    { ln: contains_emphasis?(converted) ? converted : extract_text_content(converted) }
  rescue Parslet::ParseFailed
    untouched
  end
end
382
+
383
# Tag-name pattern used to recognise HTML tags that may carry a markdown
# attribute. Defined ahead of the method that interpolates it.
HTML_TAG_PATTERN = /\w+/

# Process the HTML markdown attribute (Kramdown extension).
#
# Rewrites every <tag ... markdown="X" ...>content</tag> occurrence:
# - markdown="1", "span" or "block": content is run through
#   process_inline_in_html (block is treated like span in this inline
#   context)
# - markdown="0" or any other value: content is passed to
#   escape_html_content
# The value is compared case-insensitively, and the markdown attribute
# itself is removed from the rebuilt tag; the remaining attributes are kept.
#
# The attribute name must not be preceded by a word character or a hyphen,
# so attributes such as data-markdown="1" are left alone.
#
# @param text [String, nil] the text to scan
# @param inline_parser [Object] parser forwarded to process_inline_in_html
# @return [String, nil] the rewritten text, or nil when text is nil
def process_html_markdown_attr(text, inline_parser)
  return text if text.nil?

  # Captures: tag name, attributes before, markdown value, attributes
  # after, content. The (?<![\w-]) lookbehind anchors the attribute name.
  pattern = %r{<(#{HTML_TAG_PATTERN})\s+([^>]*?)(?<![\w-])markdown\s*=\s*["']([^"']+)["']([^>]*)>(.*?)</\1>}im

  text.gsub(pattern) do
    # Read all captures up front: the helpers below run their own regex
    # matches.
    tag_name, before_attrs, markdown_value, after_attrs, content = ::Regexp.last_match.captures

    processed_content =
      case markdown_value.downcase
      when '1', 'span', 'block'
        process_inline_in_html(content, inline_parser)
      else
        escape_html_content(content)
      end

    # Rebuild the tag without the markdown attribute.
    attrs = "#{before_attrs.strip} #{after_attrs.strip}".strip
    attrs = " #{attrs}" unless attrs.empty?
    "<#{tag_name}#{attrs}>#{processed_content}</#{tag_name}>"
  end
end
427
+
428
# Render the inline Markdown found inside HTML content to a string.
#
# Nested tags carrying a markdown attribute are expanded first, then the
# result is parsed with the inline parser and serialised through
# inline_result_to_string. The expanded (but unparsed) content is returned
# when the parser yields nil or an empty result, or raises
# Parslet::ParseFailed.
#
# @param content [String, nil] inner HTML content
# @param inline_parser [#parse] the inline parser
# @return [String, nil] rendered content; nil or empty input is returned as is
def process_inline_in_html(content, inline_parser)
  return content if content.nil? || content.empty?

  expanded = process_html_markdown_attr(content, inline_parser)

  begin
    tree = inline_parser.parse(expanded)
    if tree.nil? || tree.empty?
      expanded
    else
      inline_result_to_string(convert_inline_result(tree))
    end
  rescue Parslet::ParseFailed
    expanded
  end
end
448
+
449
# Serialise a converted inline result back to an HTML string.
#
# Arrays are rendered element by element and concatenated. Hashes are
# rendered by the first matching key, in this order: :em -> <em>,
# :strong -> <strong>, :code -> <code> (content inserted verbatim),
# :text -> its string form. A hash with none of these keys renders all of
# its values in order. Anything else is converted with #to_s.
#
# @param result [Hash, Array, Object] the converted inline result
# @return [String] the rendered string
def inline_result_to_string(result)
  return result.map { |part| inline_result_to_string(part) }.join if result.is_a?(Array)
  return result.to_s unless result.is_a?(Hash)

  return "<em>#{inline_result_to_string(result[:em])}</em>" if result.key?(:em)
  return "<strong>#{inline_result_to_string(result[:strong])}</strong>" if result.key?(:strong)
  return "<code>#{result[:code]}</code>" if result.key?(:code)
  return result[:text].to_s if result.key?(:text)

  result.values.map { |inner| inline_result_to_string(inner) }.join
end
470
+
471
# Hook for content of tags whose markdown attribute disables processing
# (markdown="0").
#
# The content is currently returned exactly as received: no characters
# are escaped or altered here.
# TODO(review): confirm whether Markdown-significant characters should be
# escaped so that later inline parsing leaves this content alone.
#
# @param content [String, nil] inner content of the HTML tag
# @return [String, nil] the same content, unchanged
def escape_html_content(content)
  content
end
477
+
478
# Tell whether a converted inline result holds emphasis anywhere.
#
# A hash counts when it has an :em or :strong key itself or when any of
# its values does; an array counts when any element does. Every other
# value never counts.
#
# @param result [Hash, Array, Object] the converted inline result
# @return [Boolean] true when :em or :strong appears at any depth
def contains_emphasis?(result)
  case result
  when Array
    result.any? { |item| contains_emphasis?(item) }
  when Hash
    %i[em strong].any? { |marker| result.key?(marker) } ||
      result.each_value.any? { |inner| contains_emphasis?(inner) }
  else
    false
  end
end
490
+
491
# Flatten a converted inline result into one plain string.
#
# Hashes are resolved by the first matching key, in this order: :em and
# :strong are unwrapped recursively, :ln and :text are stringified
# directly. A hash with none of these keys contributes the text of all
# its values in order. Arrays are flattened element by element and any
# other value is converted with #to_s.
#
# @param result [Hash, Array, Object] the converted inline result
# @return [String] the concatenated text
def extract_text_content(result)
  return result.map { |item| extract_text_content(item) }.join if result.is_a?(Array)
  return result.to_s unless result.is_a?(Hash)

  wrapper = %i[em strong].find { |key| result.key?(key) }
  return extract_text_content(result[wrapper]) if wrapper

  leaf = %i[ln text].find { |key| result.key?(key) }
  return result[leaf].to_s if leaf

  result.values.map { |inner| extract_text_content(inner) }.join
end
512
+
513
# Convert raw inline parser output into the shape used by this processor.
#
# - Arrays are converted element-wise; a one-element array collapses to
#   its single converted element.
# - { emph: x }  becomes { em: converted x }.
# - { text: x }  becomes x itself (not converted further).
# - { code: x }  becomes { code: x }, dropping any other keys.
# - Any other hash has each of its values converted.
# - Everything else, nil included, is returned unchanged.
#
# @param parsed [Array, Hash, Object, nil] raw parser output
# @return [Object] the converted structure
def convert_inline_result(parsed)
  case parsed
  when Array
    items = parsed.map { |item| convert_inline_result(item) }
    items.length == 1 ? items.first : items
  when Hash
    return { em: convert_inline_result(parsed[:emph]) } if parsed.key?(:emph)
    return parsed[:text] if parsed.key?(:text)
    return { code: parsed[:code] } if parsed.key?(:code)

    parsed.transform_values { |inner| convert_inline_result(inner) }
  else
    parsed
  end
end
539
+ end
540
+ end
541
+ end
542
+ end
543
+ end