rng 0.1.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +63 -0
  3. data/.github/workflows/release.yml +8 -3
  4. data/.gitignore +11 -0
  5. data/.rubocop.yml +10 -7
  6. data/.rubocop_todo.yml +229 -23
  7. data/CHANGELOG.md +317 -0
  8. data/CLAUDE.md +139 -0
  9. data/Gemfile +11 -12
  10. data/README.adoc +1538 -11
  11. data/Rakefile +11 -3
  12. data/docs/Gemfile +8 -0
  13. data/docs/_config.yml +23 -0
  14. data/docs/getting-started/index.adoc +75 -0
  15. data/docs/guides/error-handling.adoc +137 -0
  16. data/docs/guides/external-references.adoc +128 -0
  17. data/docs/guides/index.adoc +24 -0
  18. data/docs/guides/parsing-rnc.adoc +141 -0
  19. data/docs/guides/parsing-rng-xml.adoc +81 -0
  20. data/docs/guides/rng-to-rnc.adoc +101 -0
  21. data/docs/guides/validation.adoc +85 -0
  22. data/docs/index.adoc +52 -0
  23. data/docs/reference/api.adoc +126 -0
  24. data/docs/reference/cli.adoc +182 -0
  25. data/docs/understanding/architecture.adoc +58 -0
  26. data/docs/understanding/rng-vs-rnc.adoc +118 -0
  27. data/exe/rng +5 -0
  28. data/lib/rng/any_name.rb +10 -8
  29. data/lib/rng/attribute.rb +28 -26
  30. data/lib/rng/choice.rb +24 -24
  31. data/lib/rng/cli.rb +607 -0
  32. data/lib/rng/data.rb +10 -10
  33. data/lib/rng/datatype_declaration.rb +26 -0
  34. data/lib/rng/define.rb +44 -41
  35. data/lib/rng/div.rb +36 -0
  36. data/lib/rng/documentation.rb +9 -0
  37. data/lib/rng/element.rb +39 -37
  38. data/lib/rng/empty.rb +7 -7
  39. data/lib/rng/except.rb +25 -25
  40. data/lib/rng/external_ref.rb +8 -8
  41. data/lib/rng/external_ref_resolver.rb +602 -0
  42. data/lib/rng/foreign_attribute.rb +26 -0
  43. data/lib/rng/foreign_element.rb +33 -0
  44. data/lib/rng/grammar.rb +14 -12
  45. data/lib/rng/group.rb +26 -24
  46. data/lib/rng/include.rb +5 -6
  47. data/lib/rng/include_processor.rb +461 -0
  48. data/lib/rng/interleave.rb +23 -23
  49. data/lib/rng/list.rb +22 -22
  50. data/lib/rng/mixed.rb +23 -23
  51. data/lib/rng/name.rb +6 -7
  52. data/lib/rng/namespace_declaration.rb +47 -0
  53. data/lib/rng/namespaces.rb +15 -0
  54. data/lib/rng/not_allowed.rb +7 -7
  55. data/lib/rng/ns_name.rb +9 -9
  56. data/lib/rng/one_or_more.rb +23 -23
  57. data/lib/rng/optional.rb +23 -23
  58. data/lib/rng/param.rb +7 -8
  59. data/lib/rng/parent_ref.rb +8 -8
  60. data/lib/rng/parse_tree_processor.rb +695 -0
  61. data/lib/rng/pattern.rb +7 -7
  62. data/lib/rng/ref.rb +8 -8
  63. data/lib/rng/rnc_builder.rb +927 -0
  64. data/lib/rng/rnc_parser.rb +605 -305
  65. data/lib/rng/rnc_to_rng_converter.rb +1408 -0
  66. data/lib/rng/schema_preamble.rb +73 -0
  67. data/lib/rng/schema_validator.rb +1622 -0
  68. data/lib/rng/start.rb +27 -25
  69. data/lib/rng/test_suite_parser.rb +168 -0
  70. data/lib/rng/text.rb +11 -8
  71. data/lib/rng/to_rnc.rb +4 -35
  72. data/lib/rng/value.rb +6 -7
  73. data/lib/rng/version.rb +1 -1
  74. data/lib/rng/zero_or_more.rb +23 -23
  75. data/lib/rng.rb +68 -17
  76. data/rng.gemspec +18 -19
  77. data/scripts/extract_spectest_resources.rb +96 -0
  78. data/spec/fixtures/compacttest.xml +2511 -0
  79. data/spec/fixtures/external/circular_a.rng +7 -0
  80. data/spec/fixtures/external/circular_b.rng +7 -0
  81. data/spec/fixtures/external/circular_main.rng +7 -0
  82. data/spec/fixtures/external/external_ref_lib.rng +7 -0
  83. data/spec/fixtures/external/external_ref_main.rng +7 -0
  84. data/spec/fixtures/external/include_lib.rng +7 -0
  85. data/spec/fixtures/external/include_main.rng +3 -0
  86. data/spec/fixtures/external/nested_chain.rng +6 -0
  87. data/spec/fixtures/external/nested_leaf.rng +7 -0
  88. data/spec/fixtures/external/nested_mid.rng +8 -0
  89. data/spec/fixtures/metanorma/3gpp.rnc +35 -0
  90. data/spec/fixtures/metanorma/3gpp.rng +105 -0
  91. data/spec/fixtures/metanorma/basicdoc.rnc +11 -0
  92. data/spec/fixtures/metanorma/bipm.rnc +148 -0
  93. data/spec/fixtures/metanorma/bipm.rng +376 -0
  94. data/spec/fixtures/metanorma/bsi.rnc +104 -0
  95. data/spec/fixtures/metanorma/bsi.rng +332 -0
  96. data/spec/fixtures/metanorma/csa.rnc +45 -0
  97. data/spec/fixtures/metanorma/csa.rng +131 -0
  98. data/spec/fixtures/metanorma/csd.rnc +43 -0
  99. data/spec/fixtures/metanorma/csd.rng +132 -0
  100. data/spec/fixtures/metanorma/gbstandard.rnc +99 -0
  101. data/spec/fixtures/metanorma/gbstandard.rng +316 -0
  102. data/spec/fixtures/metanorma/iec.rnc +49 -0
  103. data/spec/fixtures/metanorma/iec.rng +193 -0
  104. data/spec/fixtures/metanorma/ietf.rnc +275 -0
  105. data/spec/fixtures/metanorma/ietf.rng +925 -0
  106. data/spec/fixtures/metanorma/iho.rnc +58 -0
  107. data/spec/fixtures/metanorma/iho.rng +179 -0
  108. data/spec/fixtures/metanorma/isodoc.rnc +873 -0
  109. data/spec/fixtures/metanorma/isodoc.rng +2704 -0
  110. data/spec/fixtures/metanorma/isostandard-amd.rnc +43 -0
  111. data/spec/fixtures/metanorma/isostandard-amd.rng +108 -0
  112. data/spec/fixtures/metanorma/isostandard.rnc +166 -0
  113. data/spec/fixtures/metanorma/isostandard.rng +494 -0
  114. data/spec/fixtures/metanorma/itu.rnc +122 -0
  115. data/spec/fixtures/metanorma/itu.rng +377 -0
  116. data/spec/fixtures/metanorma/m3d.rnc +41 -0
  117. data/spec/fixtures/metanorma/m3d.rng +122 -0
  118. data/spec/fixtures/metanorma/mpfd.rnc +36 -0
  119. data/spec/fixtures/metanorma/mpfd.rng +95 -0
  120. data/spec/fixtures/metanorma/nist.rnc +77 -0
  121. data/spec/fixtures/metanorma/nist.rng +216 -0
  122. data/spec/fixtures/metanorma/ogc.rnc +51 -0
  123. data/spec/fixtures/metanorma/ogc.rng +151 -0
  124. data/spec/fixtures/metanorma/reqt.rnc +6 -0
  125. data/spec/fixtures/metanorma/rsd.rnc +36 -0
  126. data/spec/fixtures/metanorma/rsd.rng +95 -0
  127. data/spec/fixtures/metanorma/un.rnc +103 -0
  128. data/spec/fixtures/metanorma/un.rng +367 -0
  129. data/spec/fixtures/rnc/base.rnc +4 -0
  130. data/spec/fixtures/rnc/grammar_with_trailing.rnc +8 -0
  131. data/spec/fixtures/rnc/main_include_trailing.rnc +3 -0
  132. data/spec/fixtures/rnc/main_with_include.rnc +5 -0
  133. data/spec/fixtures/rnc/test_augment.rnc +10 -0
  134. data/spec/fixtures/rnc/test_isodoc_simple.rnc +9 -0
  135. data/spec/fixtures/rnc/top_level_include.rnc +8 -0
  136. data/spec/fixtures/spectest_external/case_10_4.7/x +3 -0
  137. data/spec/fixtures/spectest_external/case_10_4.7/y +7 -0
  138. data/spec/fixtures/spectest_external/case_11_4.7/x +3 -0
  139. data/spec/fixtures/spectest_external/case_12_4.7/x +3 -0
  140. data/spec/fixtures/spectest_external/case_13_4.7/x +3 -0
  141. data/spec/fixtures/spectest_external/case_13_4.7/y +3 -0
  142. data/spec/fixtures/spectest_external/case_14_4.7/x +7 -0
  143. data/spec/fixtures/spectest_external/case_15_4.7/x +7 -0
  144. data/spec/fixtures/spectest_external/case_16_4.7/x +5 -0
  145. data/spec/fixtures/spectest_external/case_17_4.7/x +5 -0
  146. data/spec/fixtures/spectest_external/case_18_4.7/x +7 -0
  147. data/spec/fixtures/spectest_external/case_19_4.7/level1.rng +9 -0
  148. data/spec/fixtures/spectest_external/case_19_4.7/level2.rng +7 -0
  149. data/spec/fixtures/spectest_external/case_1_4.5/sub1/x +3 -0
  150. data/spec/fixtures/spectest_external/case_1_4.5/sub3/x +3 -0
  151. data/spec/fixtures/spectest_external/case_1_4.5/x +3 -0
  152. data/spec/fixtures/spectest_external/case_20_4.6/x +3 -0
  153. data/spec/fixtures/spectest_external/case_2_4.5/x +3 -0
  154. data/spec/fixtures/spectest_external/case_3_4.6/x +3 -0
  155. data/spec/fixtures/spectest_external/case_4_4.6/x +3 -0
  156. data/spec/fixtures/spectest_external/case_5_4.6/x +1 -0
  157. data/spec/fixtures/spectest_external/case_6_4.6/x +5 -0
  158. data/spec/fixtures/spectest_external/case_7_4.6/x +1 -0
  159. data/spec/fixtures/spectest_external/case_7_4.6/y +1 -0
  160. data/spec/fixtures/spectest_external/case_8_4.7/x +7 -0
  161. data/spec/fixtures/spectest_external/case_9_4.7/x +7 -0
  162. data/spec/fixtures/spectest_external/resources.json +149 -0
  163. data/spec/rng/advanced_rnc_spec.rb +101 -0
  164. data/spec/rng/compacttest_spec.rb +197 -0
  165. data/spec/rng/datatype_declaration_spec.rb +28 -0
  166. data/spec/rng/div_spec.rb +207 -0
  167. data/spec/rng/external_ref_resolver_spec.rb +122 -0
  168. data/spec/rng/metanorma_conversion_spec.rb +159 -0
  169. data/spec/rng/namespace_declaration_spec.rb +60 -0
  170. data/spec/rng/namespace_support_spec.rb +199 -0
  171. data/spec/rng/rnc_parser_spec.rb +498 -22
  172. data/spec/rng/rnc_roundtrip_spec.rb +96 -82
  173. data/spec/rng/rng_generation_spec.rb +288 -0
  174. data/spec/rng/roundtrip_spec.rb +342 -0
  175. data/spec/rng/schema_preamble_spec.rb +145 -0
  176. data/spec/rng/schema_spec.rb +68 -64
  177. data/spec/rng/spectest_spec.rb +168 -90
  178. data/spec/rng_spec.rb +2 -2
  179. data/spec/spec_helper.rb +7 -42
  180. metadata +141 -8
@@ -1,393 +1,693 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "parslet"
4
- require "nokogiri"
5
- require_relative "grammar"
3
+ require 'parslet'
4
+ require 'nokogiri'
5
+ require 'set'
6
6
 
7
7
  module Rng
8
8
  class RncParser < Parslet::Parser
9
+ # Helper method to extract clean string without Parslet position markers
10
+ def self.extract_string(obj)
11
+ if obj.respond_to?(:str)
12
+ # Parslet::Slice - use .str to get clean string
13
+ obj.str
14
+ elsif obj.is_a?(String)
15
+ obj
16
+ else
17
+ obj.to_s
18
+ end
19
+ end
20
+
21
+ # Comments
22
+ # Regular comment: single #
23
+ rule(:comment) { str('#') >> str('#').absent? >> match('[^\n]').repeat >> (str("\n") | any.absent?) }
24
+ rule(:comment?) { comment.maybe }
25
+
26
+ # Documentation comment: ##
27
+ rule(:doc_comment) { str('##') >> match('[^\n]').repeat.as(:doc_line) >> (str("\n") | any.absent?) }
28
+ rule(:doc_comments) { (doc_comment >> (whitespace.maybe >> doc_comment).repeat).as(:documentation) }
29
+
30
+ # Whitespace (including comments)
9
31
  rule(:space) { match('\s').repeat(1) }
10
32
  rule(:space?) { space.maybe }
11
33
  rule(:newline) { (str("\r").maybe >> str("\n")).repeat(1) }
12
34
  rule(:newline?) { newline.maybe }
13
- rule(:whitespace) { (space | newline).repeat }
14
- rule(:comma) { str(",") }
35
+ # Only regular comments in whitespace - doc comments are captured by pattern rules
36
+ rule(:whitespace) { (space | newline | comment).repeat }
37
+ rule(:comma) { str(',') }
15
38
  rule(:comma?) { (whitespace >> comma >> whitespace).maybe }
16
39
 
17
- rule(:identifier) { match("[a-zA-Z0-9_]").repeat(1).as(:identifier) }
18
- rule(:namespace_prefix) { identifier.as(:prefix) >> str(":") }
40
+ # Escape sequences support
41
+ # Unicode code point: \x{HHHHHH} (1-6 hex digits)
42
+ rule(:hex_escape) do
43
+ str('\\x{') >> match('[0-9a-fA-F]').repeat(1, 6).as(:hex) >> str('}')
44
+ end
45
+
46
+ # Match a keyword that may contain hex escapes
47
+ # Hex escapes are resolved in pre-processing, so keywords match literally here
48
+ # TODO: the case where pre-processing didn't happen is not handled yet; this is a plain literal match
49
+ def keyword(kw)
50
+ str(kw)
51
+ end
52
+
53
+ # Character escapes for strings: \", \\, \n, \r, \t, and RELAX NG class escapes: \i, \c, \d, \w
54
+ rule(:char_escape) do
55
+ str('\\') >> match('["\\\\ntricdw]').as(:char)
56
+ end
57
+
58
+ # Identifier can contain regular chars, dots, hex escapes, or backslash escapes
59
+ rule(:identifier_char) do
60
+ hex_escape.as(:hex_escape) |
61
+ (str('\\') >> str('\\').as(:escaped_backslash)).as(:backslash_escape) |
62
+ (str('\\') >> (match('[a-zA-Z0-9_.-]').as(:escaped_char) | match('[a-zA-Z]').as(:escaped_keyword))).as(:backslash_escape) |
63
+ match('[a-zA-Z0-9_.-]').as(:char)
64
+ end
65
+
66
+ rule(:identifier) { identifier_char.repeat(1).as(:identifier_parts) }
67
+ rule(:namespace_prefix) { identifier.as(:prefix) >> str(':') }
19
68
  rule(:namespace_prefix?) { namespace_prefix.maybe }
20
69
  rule(:qualified_name) { namespace_prefix? >> identifier.as(:local_name) }
21
70
 
22
- rule(:datatype_library) { str("datatypes") >> space >> identifier.as(:prefix) >> space >> string_literal.as(:uri) }
71
+ # Name wildcards for anyName and nsName patterns
23
72
 
24
- rule(:string_literal) { str('"') >> match('[^"]').repeat.as(:string) >> str('"') }
73
+ # anyName wildcard: * or * - exceptName
74
+ rule(:any_name_pattern) do
75
+ str('*') >>
76
+ (space >> str('-') >> space >> name_class_except).maybe.as(:except)
77
+ end
25
78
 
26
- rule(:element_def) do
27
- str("element") >> space >>
28
- qualified_name.as(:name) >>
29
- whitespace >>
30
- str("{") >>
31
- whitespace >>
32
- content.maybe.as(:content) >>
33
- whitespace >>
34
- str("}") >>
35
- (str("*") | str("+") | str("?")).maybe.as(:occurrence)
79
+ # nsName wildcard: prefix:* or prefix:* - exceptName
80
+ rule(:ns_name_pattern) do
81
+ namespace_prefix >> str('*') >>
82
+ (space >> str('-') >> space >> name_class_except).maybe.as(:except)
36
83
  end
37
84
 
38
- rule(:attribute_def) do
39
- str("attribute") >> space >>
40
- qualified_name.as(:name) >>
41
- whitespace >>
42
- str("{") >>
43
- whitespace >>
44
- (datatype_ref | str("text")).as(:type) >>
45
- whitespace >>
46
- str("}")
85
+ # Except clause can be a single name or multiple names in parentheses
86
+ rule(:name_class_except) do
87
+ (str('(') >> space? >> name_class >>
88
+ (space? >> str('|') >> space? >> name_class).repeat >>
89
+ space? >> str(')')) |
90
+ name_class
47
91
  end
48
92
 
49
- rule(:datatype_ref) do
50
- identifier.as(:prefix) >> str(":") >> identifier.as(:type)
93
+ # Name_class rule is useful for EBNF generation of a name_class.
94
+ # It can parse a qualified name, a choice of qualified names, or the anyName/nsName wildcard patterns.
95
+ # !!!!!!!!!
96
+ # GENERAL RULE: any rule that walks over input bytes consumes them, so every rule call in the hierarchy
97
+ # must run to completion; otherwise backtracking to disqualify the pattern won't work.
98
+ # !!!!!!!!!
99
+ # Try wildcards first (more specific), then fall back to qualified names
100
+ rule(:name_class) do
101
+ ns_name_pattern.as(:ns_name) |
102
+ any_name_pattern.as(:any_name) |
103
+ (qualified_name >> (space? >> str('|') >> space? >> qualified_name).repeat(1).as(:name_choice_items)).as(:name_choice) |
104
+ qualified_name.as(:name)
51
105
  end
52
106
 
53
- rule(:text_def) { str("text").as(:text) }
54
- rule(:empty_def) { str("empty").as(:empty) }
107
+ # Datatype library declaration: datatypes prefix = "uri" (replaces the former datatype_library rule)
108
+ rule(:datatype_decl) do
109
+ keyword('datatypes') >> space >>
110
+ identifier.as(:prefix) >> space? >>
111
+ str('=') >> space? >>
112
+ string_literal.as(:uri)
113
+ end
55
114
 
56
- rule(:group_def) do
57
- str("(") >>
58
- whitespace >>
59
- content.as(:group) >>
60
- whitespace >>
61
- str(")") >>
62
- (str("*") | str("+") | str("?")).maybe.as(:occurrence)
115
+ # String literal with optional concatenation using ~ operator
116
+ # Supports escape sequences: \x{HEX}, \", \\, \n, \r, \t, and the class escapes \i, \c, \d, \w
117
+ # Control characters (0x00-0x1F, 0x7F) must be escaped
118
+ rule(:string_char) do
119
+ hex_escape.as(:hex_escape) |
120
+ char_escape.as(:char_escape) |
121
+ (str('\\').absent? >> str('"').absent? >>
122
+ match('[\u0000-\u001F\u007F]').absent? >> any).as(:char)
63
123
  end
64
124
 
65
- rule(:choice_def) do
66
- content_item.as(:first) >>
67
- (whitespace >> str("|") >> whitespace >> content_item.as(:second)).repeat(1).as(:rest)
125
+ # String chars for single-quote strings (same escapes, different delimiter)
126
+ rule(:single_string_char) do
127
+ hex_escape.as(:hex_escape) |
128
+ char_escape.as(:char_escape) |
129
+ (str('\\').absent? >> str("'").absent? >>
130
+ match('[\u0000-\u001F\u007F]').absent? >> any).as(:char)
68
131
  end
69
132
 
70
- rule(:named_pattern) do
71
- identifier.as(:name) >> whitespace >> str("=") >> whitespace >> content_item.as(:pattern)
133
+ rule(:string_literal) do
134
+ # Multi-line strings: """...""" (can span multiple lines)
135
+ # Content is any char except """
136
+ # Use a helper: char is content if """ is NOT at this position
137
+ multi_line_double = str('"""') >>
138
+ (str('"""').absent? >> any).repeat.as(:multi_line_parts) >>
139
+ str('"""')
140
+ # Multi-line strings: '''...''' (can span multiple lines)
141
+ multi_line_single = str("'''") >>
142
+ (str("'''").absent? >> any).repeat.as(:multi_line_parts) >>
143
+ str("'''")
144
+ # Single-line double-quote strings with concatenation: "..." ~ "..."
145
+ double_string = str('"') >> string_char.repeat.as(:string_parts) >> str('"')
146
+ # Single-line single-quote strings with concatenation: '...' ~ '...'
147
+ single_string = str("'") >> single_string_char.repeat.as(:string_parts) >> str("'")
148
+ concat_part = whitespace >> str('~') >> whitespace >>
149
+ str('"') >> string_char.repeat.as(:concat_string_parts) >> str('"')
150
+ single_concat_part = whitespace >> str('~') >> whitespace >>
151
+ str("'") >> single_string_char.repeat.as(:concat_string_parts) >> str("'")
152
+
153
+ multi_line_concat_double = whitespace >> str('~') >> whitespace >>
154
+ str('"""') >>
155
+ (str('"""').absent? >> any).repeat.as(:concat_multi_line_parts) >>
156
+ str('"""')
157
+ multi_line_concat_single = whitespace >> str('~') >> whitespace >>
158
+ str("'''") >>
159
+ (str("'''").absent? >> any).repeat.as(:concat_multi_line_parts) >>
160
+ str("'''")
161
+
162
+ # Ordered choice: multi-line forms are tried before single-line ones; each may be followed by ~ concatenations
163
+ (multi_line_double >> (concat_part | single_concat_part | multi_line_concat_double | multi_line_concat_single).repeat.maybe.as(:concatenations)) |
164
+ (multi_line_single >> (concat_part | single_concat_part | multi_line_concat_double | multi_line_concat_single).repeat.maybe.as(:concatenations)) |
165
+ (double_string >> (concat_part | single_concat_part | multi_line_concat_double | multi_line_concat_single).repeat.maybe.as(:concatenations)) |
166
+ (single_string >> (concat_part | single_concat_part | multi_line_concat_double | multi_line_concat_single).repeat.maybe.as(:concatenations))
72
167
  end
73
168
 
74
- rule(:content_item) do
75
- element_def | attribute_def | text_def | empty_def | group_def | choice_def | identifier.as(:ref)
169
+ # Value pattern for literal values
170
+ rule(:value_literal) { string_literal.as(:value) }
171
+
172
+ # Mixed content pattern
173
+ rule(:mixed_pattern) do
174
+ keyword('mixed') >> whitespace >> str('{') >> whitespace >>
175
+ content.as(:mixed_content) >> whitespace >> str('}')
76
176
  end
77
177
 
78
- rule(:content) do
79
- (content_item >> (comma? >> content_item).repeat).as(:items)
178
+ # Namespace declarations
179
+ # Default namespace (unprefixed): default namespace = "uri"
180
+ rule(:default_namespace_decl) do
181
+ keyword('default') >> space >> keyword('namespace') >> space? >>
182
+ str('=') >> space? >> string_literal.as(:uri)
80
183
  end
81
184
 
82
- rule(:start_def) do
83
- str("start") >> whitespace >> str("=") >> whitespace >> content_item.as(:start)
185
+ # Default namespace (prefixed): default namespace prefix = "uri"
186
+ rule(:default_prefixed_namespace_decl) do
187
+ keyword('default') >> space >> keyword('namespace') >> space >>
188
+ identifier.as(:prefix) >> space? >>
189
+ str('=') >> space? >> string_literal.as(:uri)
84
190
  end
85
191
 
86
- rule(:grammar) do
87
- whitespace >>
88
- datatype_library.maybe.as(:datatype_library) >>
89
- whitespace >>
90
- (start_def | named_pattern | element_def).as(:root) >>
91
- (whitespace >> (named_pattern | element_def)).repeat.as(:definitions) >>
192
+ # Prefixed namespace: namespace prefix = "uri"
193
+ rule(:prefixed_namespace_decl) do
194
+ keyword('namespace') >> space >>
195
+ identifier.as(:prefix) >> space? >>
196
+ str('=') >> space? >> string_literal.as(:uri)
197
+ end
198
+
199
+ # Any namespace declaration
200
+ rule(:namespace_decl) do
201
+ default_prefixed_namespace_decl.as(:default_prefixed_ns) |
202
+ default_namespace_decl.as(:default_ns) |
203
+ prefixed_namespace_decl.as(:prefixed_ns)
204
+ end
205
+
206
+ # Annotation element inner content (recursive for nested brackets)
207
+ rule(:annotation_inner_content) do
208
+ (
209
+ # Nested annotation brackets
210
+ (str('[') >> annotation_inner_content >> str(']')) |
211
+ # String literal (don't let brackets inside strings confuse us)
212
+ string_literal |
213
+ # Any char that's not a bracket, quote
214
+ (str('[').absent? >> str(']').absent? >> str('"').absent? >> str("'").absent? >>
215
+ any)
216
+ ).repeat
217
+ end
218
+
219
+ # Annotation attribute: prefix:local = "value" or local = "value"
220
+ rule(:annotation_attr) do
221
+ (((namespace_prefix >> identifier) | identifier).as(:ann_name) >>
222
+ whitespace >> str('=') >> whitespace >>
223
+ string_literal.as(:attr_value)).as(:ann_attr)
224
+ end
225
+
226
+ # Annotation element: prefix:local [ content ] or local [ content ]
227
+ rule(:annotation_elem) do
228
+ (((namespace_prefix >> identifier) | identifier).as(:elem_name) >>
229
+ whitespace >> str('[') >> whitespace >>
230
+ annotation_inner_content.as(:inner_content) >> whitespace >>
231
+ str(']')).as(:ann_elem)
232
+ end
233
+
234
+ # A single annotation item (attribute or element)
235
+ rule(:annotation_item) do
236
+ annotation_elem | annotation_attr
237
+ end
238
+
239
+ # Annotation content: sequence of annotation items OR empty OR raw content (comments, etc.)
240
+ # Raw content matches any character that is NOT a bracket or quote
241
+ rule(:annotation_content) do
242
+ (annotation_item >> (whitespace >> annotation_item).repeat >> whitespace).as(:ann_items) |
243
+ (str('[').absent? >> str(']').absent? >> str('"').absent? >> str("'").absent? >> any).repeat.as(:raw_content) |
92
244
  whitespace
93
245
  end
94
246
 
95
- root(:grammar)
247
+ # Single annotation: [ content ] where content can contain nested brackets, strings, etc.
248
+ # Appears before patterns, definitions, and within annotation elements
249
+ # Handles both empty [] and content-bearing [x = "y"] annotations
250
+ rule(:annotation) do
251
+ str('[') >> whitespace >>
252
+ (
253
+ (annotation_content >> whitespace >> str(']')).as(:ann) |
254
+ str(']').as(:ann)
255
+ )
256
+ end
96
257
 
97
- def self.parse(input)
98
- parser = new
99
- tree = parser.parse(input.strip)
100
- convert_to_rng(tree)
258
+ # One or more annotations preceding a pattern
259
+ rule(:annotations) do
260
+ (whitespace >> annotation).repeat(1)
101
261
  end
102
262
 
103
- def self.to_rnc(schema)
104
- # Convert RNG schema to RNC
105
- builder = RncBuilder.new
106
- builder.build(schema)
263
+ # Notation/annotation: [ key = "value" ] or just [ ... ]
264
+ # Notations are only valid when attached to patterns using >>, not as standalone preamble items
265
+ rule(:notation) do
266
+ annotation
107
267
  end
108
268
 
109
- def self.convert_to_rng(tree)
110
- builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
111
- if tree[:root].key?(:start)
112
- # This is a grammar with named patterns
113
- xml.grammar(xmlns: "http://relaxng.org/ns/structure/1.0") do
114
- # Add datatype library if present
115
- xml.datatypeLibrary tree[:datatype_library][:uri][:string].to_s if tree[:datatype_library]
116
-
117
- # Process start pattern
118
- xml.start do
119
- process_content_item(xml, tree[:root][:start])
120
- end
269
+ rule(:element_def) do
270
+ (doc_comments >> whitespace).maybe.as(:docs) >>
271
+ annotations.maybe.as(:annotations) >>
272
+ keyword('element') >> whitespace >>
273
+ name_class.as(:name) >>
274
+ whitespace >>
275
+ str('{') >> whitespace >>
276
+ content.maybe.as(:content) >> whitespace >>
277
+ str('}') >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)
278
+ end
121
279
 
122
- # Process named patterns
123
- tree[:definitions]&.each do |def_item|
124
- next unless def_item.key?(:name)
280
+ rule(:attribute_def) do
281
+ (doc_comments >> whitespace).maybe.as(:docs) >>
282
+ annotations.maybe.as(:annotations) >>
283
+ keyword('attribute') >> whitespace >>
284
+ name_class.as(:name) >>
285
+ whitespace >>
286
+ str('{') >>
287
+ whitespace >>
288
+ attribute_content.as(:type) >>
289
+ whitespace >>
290
+ str('}') >>
291
+ (str('*') | str('+') | str('?')).maybe.as(:occurrence)
292
+ end
125
293
 
126
- xml.define(name: def_item[:name][:identifier].to_s) do
127
- process_content_item(xml, def_item[:pattern])
128
- end
129
- end
130
- end
131
- else
132
- # This is a simple element pattern
133
- process_content_item(xml, tree[:root])
134
- end
135
- end
294
+ # Attribute content can be: parenthesized choice, datatype_ref, text, value literal, or choice of values
295
+ rule(:attribute_content) do
296
+ # Parenthesized choice: ( "a" | "b" | "c" ) or ( ref1 | ref2 )
297
+ (str('(') >> whitespace >>
298
+ (value_literal | identifier.as(:ref)) >>
299
+ (whitespace >> str('|') >> whitespace >> (value_literal | identifier.as(:ref))).repeat(1) >>
300
+ whitespace >> str(')')).as(:paren_choice) |
301
+ # Datatype except: prefix:type - ( "a" | "b" ) or type - ( "a" | "b" )
302
+ (identifier.as(:datatype_prefix) >> str(':') >> identifier.as(:datatype_type) >>
303
+ whitespace >> str('-') >> whitespace >>
304
+ str('(') >> whitespace >>
305
+ value_literal >>
306
+ (whitespace >> str('|') >> whitespace >> value_literal).repeat(1) >>
307
+ whitespace >> str(')')).as(:datatype_except) |
308
+ (identifier.as(:datatype_type) >>
309
+ whitespace >> str('-') >> whitespace >>
310
+ str('(') >> whitespace >>
311
+ value_literal >>
312
+ (whitespace >> str('|') >> whitespace >> value_literal).repeat(1) >>
313
+ whitespace >> str(')')).as(:datatype_except) |
314
+ # Non-parenthesized choice of value literals: "a" | "b" | "c"
315
+ (value_literal >> (whitespace >> str('|') >> whitespace >> value_literal).repeat(1).as(:value_choice)) |
316
+ value_literal |
317
+ datatype_ref |
318
+ keyword('text').as(:text_type) |
319
+ identifier.as(:ref)
320
+ end
136
321
 
137
- builder.to_xml
322
+ rule(:datatype_ref) do
323
+ identifier.as(:prefix) >> str(':') >> identifier.as(:type) >>
324
+ (whitespace >> str('{') >> whitespace >>
325
+ param_list.as(:params) >> whitespace >> str('}')).maybe
138
326
  end
139
327
 
140
- def self.process_content_item(xml, item)
141
- if item.key?(:name)
142
- # Element definition
143
- attrs = {}
144
- attrs[:name] = item[:name][:local_name][:identifier].to_s
328
+ # Parameter list for datatypes (e.g., pattern = "value", minLength = "1")
329
+ rule(:param_list) do
330
+ param_item >> (whitespace >> param_item).repeat
331
+ end
145
332
 
146
- attrs[:ns] = item[:name][:prefix][:identifier].to_s if item[:name][:prefix]
333
+ # Single parameter (e.g., pattern = "value")
334
+ rule(:param_item) do
335
+ identifier.as(:param_name) >> whitespace >> str('=') >> whitespace >>
336
+ string_literal.as(:param_value)
337
+ end
147
338
 
148
- xml.element(attrs) do
149
- if item[:content]
150
- item[:content][:items].each do |content_item|
151
- process_content_item(xml, content_item)
152
- end
153
- end
154
- end
339
+ # Word boundary - ensure keywords are not followed by a letter, digit, '_' or '-' (note: '.', also an identifier character, is not excluded)
340
+ # This prevents "text" from matching "textarea", etc.
341
+ rule(:word_boundary) { match('[a-zA-Z0-9_-]').absent? }
155
342
 
156
- # Handle occurrence
157
- if item[:occurrence]
158
- case item[:occurrence].to_s
159
- when "*"
160
- xml.parent.name = "zeroOrMore"
161
- when "+"
162
- xml.parent.name = "oneOrMore"
163
- when "?"
164
- xml.parent.name = "optional"
165
- end
166
- end
167
- elsif item.key?(:attr_name)
168
- # Attribute definition
169
- attrs = {}
170
- attrs[:name] = item[:attr_name][:local_name][:identifier].to_s
171
-
172
- attrs[:ns] = item[:attr_name][:prefix][:identifier].to_s if item[:attr_name][:prefix]
173
-
174
- xml.attribute(attrs) do
175
- if item[:type] == "text"
176
- xml.text
177
- elsif item[:type].key?(:prefix)
178
- xml.data(type: item[:type][:type][:identifier].to_s,
179
- datatypeLibrary: "http://www.w3.org/2001/XMLSchema-datatypes")
180
- end
181
- end
182
- elsif item.key?(:text)
183
- xml.text
184
- elsif item.key?(:empty)
185
- xml.empty
186
- elsif item.key?(:group)
187
- xml.group do
188
- item[:group][:items].each do |group_item|
189
- process_content_item(xml, group_item)
190
- end
191
- end
343
+ # Keyword patterns with word boundaries
344
+ rule(:text_def) { (keyword('text') >> word_boundary).as(:text) }
345
+ rule(:empty_def) { (keyword('empty') >> word_boundary).as(:empty) }
346
+ rule(:not_allowed_def) { (keyword('notAllowed') >> word_boundary).as(:not_allowed) }
192
347
 
193
- # Handle occurrence
194
- if item[:occurrence]
195
- case item[:occurrence].to_s
196
- when "*"
197
- xml.parent.name = "zeroOrMore"
198
- when "+"
199
- xml.parent.name = "oneOrMore"
200
- when "?"
201
- xml.parent.name = "optional"
202
- end
203
- end
204
- elsif item.key?(:first) && item.key?(:rest)
205
- # Choice definition
206
- xml.choice do
207
- process_content_item(xml, item[:first])
208
- item[:rest].each do |choice_item|
209
- process_content_item(xml, choice_item[:second])
210
- end
211
- end
212
- elsif item.key?(:ref)
213
- # Reference to a named pattern
214
- xml.ref(name: item[:ref][:identifier].to_s)
215
- end
348
+ rule(:list_pattern) do
349
+ keyword('list') >> whitespace >> str('{') >> whitespace >>
350
+ list_content.as(:list_content) >> whitespace >> str('}')
216
351
  end
217
- end
218
352
 
219
- class RncBuilder
220
- def build(schema)
221
- if schema.element
222
- # Simple element pattern
223
- build_element(schema.element)
224
- else
225
- # Grammar with named patterns
226
- result = []
353
+ rule(:parent_ref) do
354
+ keyword('parent') >> whitespace >> identifier.as(:parent_pattern)
355
+ end
227
356
 
228
- # Add datatype library if present
229
- if schema.datatypeLibrary
230
- result << "datatypes xsd = \"#{schema.datatypeLibrary}\""
231
- result << ""
232
- end
357
+ rule(:external_ref) do
358
+ keyword('external') >> space >> string_literal.as(:external_href)
359
+ end
233
360
 
234
- # Process start pattern
235
- if schema.start
236
- result << "start = #{build_pattern(schema.start)}"
237
- result << ""
238
- end
361
+ # List content can be: text, datatype references, or other patterns with occurrence markers
362
+ rule(:list_content_item) do
363
+ (datatype_ref | text_def | identifier.as(:ref)) >>
364
+ (str('*') | str('+') | str('?')).maybe.as(:occurrence)
365
+ end
239
366
 
240
- # Process named patterns
241
- if schema.define && !schema.define.empty?
242
- schema.define.each do |define|
243
- result << "#{define.name} = #{build_pattern(define)}"
244
- result << ""
245
- end
246
- end
367
+ rule(:list_content) do
368
+ list_content_item.as(:first) >>
369
+ (comma? >> list_content_item).repeat.as(:sequence_items).maybe
370
+ end
247
371
 
248
- result.join("\n")
249
- end
372
+ rule(:group_def) do
373
+ str('(') >>
374
+ whitespace >>
375
+ content.as(:group) >>
376
+ whitespace >>
377
+ str(')') >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)
250
378
  end
251
379
 
252
- private
380
+ # Named pattern definition (e.g., "myPattern = element foo { text }")
381
+ rule(:named_pattern) do
382
+ (doc_comments >> whitespace).maybe.as(:docs) >>
383
+ annotations.maybe >>
384
+ identifier.as(:name) >> whitespace >>
385
+ (str('|=') | str('&=') | str('=')).as(:operator) >> whitespace >>
386
+ pattern_list.as(:pattern)
387
+ end
253
388
 
254
- def build_element(element)
255
- result = "element #{element.name} {\n"
256
- result += " #{build_content(element)}\n"
257
- result += "}"
258
- result
389
+ # Start pattern definition
390
+ rule(:start_def) do
391
+ (doc_comments >> whitespace).maybe.as(:docs) >>
392
+ annotations.maybe >>
393
+ keyword('start') >> whitespace >>
394
+ (str('|=') | str('&=') | str('=')).as(:operator) >> whitespace >>
395
+ pattern_list.as(:start_pattern)
259
396
  end
260
397
 
261
- def build_content(node)
262
- content_parts = []
398
+ # Pattern list - similar to content but without being wrapped in element/attribute
399
+ rule(:pattern_list) do
400
+ content_item.as(:first) >>
401
+ (
402
+ (whitespace >> str('&') >> whitespace >> content_item).repeat(1).as(:interleave_items) |
403
+ (whitespace >> str('|') >> whitespace >> content_item).repeat(1).as(:choice_items) |
404
+ (comma? >> content_item).repeat(1).as(:sequence_items)
405
+ ).maybe
406
+ end
263
407
 
264
- # Process attributes
265
- if node.attribute
266
- if node.attribute.is_a?(Array)
267
- node.attribute.each do |attr|
268
- content_parts << build_attribute(attr)
269
- end
270
- else
271
- content_parts << build_attribute(node.attribute)
272
- end
273
- end
408
+ # Choice is handled at content level, not as separate pattern
409
+ rule(:content_item) do
410
+ annotations.maybe >>
411
+ (element_def | attribute_def |
412
+ # Datatype subtraction: identifier - ( value|identifier|choice|annotated )
413
+ (identifier.as(:datatype_name) >>
414
+ whitespace >> str('-') >> whitespace >>
415
+ str('(') >> whitespace >>
416
+ datatype_except_value >>
417
+ (whitespace >> str('|') >> whitespace >> datatype_except_value).repeat.as(:more_except) >>
418
+ whitespace >> str(')')).as(:datatype_subtraction) |
419
+ (text_def >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)) |
420
+ (empty_def >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)) |
421
+ (not_allowed_def >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)) |
422
+ list_pattern | parent_ref | external_ref | group_def | mixed_pattern |
423
+ grammar_block.as(:grammar_block) |
424
+ (value_literal >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)) |
425
+ (datatype_ref >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)) |
426
+ (identifier.as(:ref) >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)))
427
+ end
274
428
 
275
- # Process child elements
276
- if node.element
277
- if node.element.is_a?(Array)
278
- node.element.each do |elem|
279
- content_parts << build_element(elem)
280
- end
281
- else
282
- content_parts << build_element(node.element)
283
- end
284
- end
429
+ # Value that can appear in a datatype except clause
430
+ # Includes string literals, identifiers (for datatype names), annotated values,
431
+ # and parenthesized content (for nested groups with annotations)
432
+ rule(:datatype_except_value) do
433
+ # Annotated parenthesized content: group_def >> annotation
434
+ (group_def >>
435
+ whitespace >> str('>>') >> whitespace >>
436
+ (foreign_element | annotation).as(:annotation)).as(:annotated_except_value) |
437
+ # Annotated value: value_literal >> identifier [] or value_literal >> [ ... ]
438
+ ((value_literal | identifier.as(:datatype_name)) >>
439
+ whitespace >> str('>>') >> whitespace >>
440
+ (foreign_element | annotation).as(:annotation)).as(:annotated_except_value) |
441
+ # Regular parenthesized content (without annotation)
442
+ group_def |
443
+ # Regular value literal or identifier
444
+ value_literal |
445
+ identifier.as(:datatype_name)
446
+ end
285
447
 
286
- # Process text
287
- content_parts << "text" if node.text
448
+ # Content can be interleaved with &, a sequence with commas, or alternatives with |
449
+ rule(:content) do
450
+ content_item.as(:first) >>
451
+ (
452
+ # Annotation attachment: pattern >> identifier [] or pattern >> [ content ]
453
+ (whitespace >> str('>>') >> whitespace >> (foreign_element | annotation).as(:annotation_attached)).repeat(1).as(:annotation_chain) |
454
+ (whitespace >> str('&') >> whitespace >> content_item).repeat(1).as(:interleave_items) |
455
+ (whitespace >> str('|') >> whitespace >> content_item).repeat(1).as(:choice_items) |
456
+ (comma? >> content_item).repeat(1).as(:sequence_items)
457
+ ).maybe
458
+ end
288
459
 
289
- # Process empty
290
- content_parts << "empty" if node.empty
460
+ # Parse balanced braces content - matches everything inside {} including nested {}
461
+ rule(:balanced_braces) do
462
+ (
463
+ (str('{') >> balanced_braces >> str('}')) |
464
+ (str('{').absent? >> str('}').absent? >> any)
465
+ ).repeat
466
+ end
291
467
 
292
- # Process choice
293
- if node.choice
294
- choice_parts = []
295
- if node.choice.is_a?(Array)
296
- node.choice.each do |choice|
297
- choice_parts << build_pattern(choice)
298
- end
299
- else
300
- choice_parts << build_pattern(node.choice)
301
- end
302
- content_parts << choice_parts.join(" | ")
303
- end
468
+ # Include directive - capture override as raw text to avoid backtracking
469
+ # Will be parsed with proper scoping in post-processing
470
+ rule(:include_directive) do
471
+ keyword('include') >> space >> string_literal.as(:href) >> whitespace >>
472
+ (str('{') >> whitespace >>
473
+ balanced_braces.as(:raw_override) >>
474
+ whitespace >> str('}')).maybe.as(:override)
475
+ end
304
476
 
305
- # Process group
306
- if node.group
307
- group_parts = []
308
- if node.group.is_a?(Array)
309
- node.group.each do |group|
310
- group_parts << build_pattern(group)
311
- end
312
- else
313
- group_parts << build_pattern(node.group)
314
- end
315
- content_parts << "(#{group_parts.join(", ")})"
316
- end
477
+ # Include directive - legacy layout with start_def first
478
+ rule(:include_directive_legacy) do
479
+ keyword('include') >> space >> string_literal.as(:href) >> whitespace >>
480
+ start_def.maybe.as(:start) >> whitespace >>
481
+ (named_pattern | element_def.as(:top_element)).repeat.as(:definitions)
482
+ end
317
483
 
318
- # Process ref
319
- if node.ref
320
- if node.ref.is_a?(Array)
321
- node.ref.each do |ref|
322
- content_parts << ref.name
323
- end
324
- else
325
- content_parts << node.ref.name
326
- end
327
- end
484
+ # Foreign element at grammar/div level: name [annotation-content]
485
+ # e.g., foo [] or rng:foo [ "val" ] or foo [ bar [ "baz" ] ]
486
+ # These are annotation elements that appear as standalone items
487
+ rule(:foreign_element) do
488
+ ((namespace_prefix >> identifier) | identifier).as(:foreign_name) >>
489
+ whitespace >> annotation.as(:foreign_annotation)
490
+ end
328
491
 
329
- # Process zeroOrMore
330
- content_parts << "#{build_pattern(node.zeroOrMore)}*" if node.zeroOrMore
492
+ # Div block for documentation and grouping
493
+ rule(:div_block) do
494
+ keyword('div') >> whitespace >> str('{') >> whitespace >>
495
+ (start_def.maybe.as(:start) >>
496
+ whitespace >>
497
+ (include_directive >> whitespace).repeat.as(:includes) >>
498
+ ((named_pattern | foreign_element | div_block.as(:nested_div) | element_def.as(:top_element)) >> whitespace).repeat.as(:patterns)) >>
499
+ whitespace >> str('}')
500
+ end
331
501
 
332
- # Process oneOrMore
333
- content_parts << "#{build_pattern(node.oneOrMore)}+" if node.oneOrMore
502
+ # Standalone pattern - like content_item but without element_def/attribute_def
503
+ # These are patterns that can appear at grammar level without being definitions
504
+ rule(:standalone_pattern) do
505
+ text_def | empty_def | not_allowed_def |
506
+ list_pattern | parent_ref | external_ref | group_def | mixed_pattern |
507
+ datatype_ref |
508
+ value_literal |
509
+ (identifier.as(:ref) >> (str('*') | str('+') | str('?')).maybe.as(:occurrence)) |
510
+ (str('*') >> (str('-') >> space >> name_class).maybe.as(:any_name_except)).as(:bare_any_name)
511
+ end
512
+
513
+ # Grammar-level choice: allows element foo { empty } | element bar { empty }
514
+ # at the top level of a grammar
515
+ rule(:grammar_choice) do
516
+ (element_def | standalone_pattern).as(:first) >>
517
+ (whitespace >> str('|') >> whitespace >>
518
+ (element_def | standalone_pattern)).repeat(1).as(:choice_items)
519
+ end
520
+
521
+ # Grammar can have optional datatype library, start, then multiple named patterns and elements
522
+ # Allow standalone patterns (like 'foo', 'text', 'empty', etc.) as a fallback
523
+ # Allow grammar-level choice: element foo { empty } | element bar { empty }
524
+ rule(:grammar) do
525
+ start_def.maybe.as(:start) >>
526
+ whitespace >>
527
+ (include_directive >> whitespace).repeat.as(:includes) >>
528
+ ((named_pattern | foreign_element | div_block.as(:div) | grammar_choice.as(:top_choice) |
529
+ element_def.as(:top_element) |
530
+ standalone_pattern.as(:standalone)) >> whitespace).repeat.as(:patterns)
531
+ end
334
532
 
335
- # Process optional
336
- content_parts << "#{build_pattern(node.optional)}?" if node.optional
533
+ # Grammar block wrapper - capture content as raw text to avoid backtracking
534
+ # Will be parsed with proper scoping in post-processing
535
+ rule(:grammar_block) do
536
+ keyword('grammar') >> whitespace >> str('{') >> whitespace >>
537
+ balanced_braces.as(:raw_grammar) >>
538
+ whitespace >> str('}')
539
+ end
337
540
 
338
- content_parts.join(",\n ")
541
+ # Included file - more flexible than grammar_wrapper
542
+ # Can be:
543
+ # 1. Just a flat grammar (patterns only)
544
+ # 2. Grammar block
545
+ # 3. Grammar block with trailing definitions
546
+ # 4. Preamble + grammar/grammar_block
547
+ # 5. Empty file
548
+ rule(:included_file) do
549
+ whitespace >>
550
+ preamble.maybe >>
551
+ whitespace >>
552
+ (
553
+ # Grammar block with optional trailing definitions
554
+ (grammar_block.as(:inner_grammar) >>
555
+ (whitespace >> (named_pattern | element_def.as(:top_element))).repeat.as(:trailing_definitions)) |
556
+ # Flat grammar (no wrapper)
557
+ grammar |
558
+ # Empty file is also valid
559
+ str('')
560
+ ) >>
561
+ whitespace
339
562
  end
340
563
 
341
- def build_attribute(attr)
342
- result = "attribute #{attr.name} { "
564
+ # Schema preamble - namespace and datatype declarations
565
+ # Annotations [ key = "value" ] are also allowed in preamble for documentation
566
+ rule(:preamble_item) do
567
+ (namespace_decl | datatype_decl | notation) >> whitespace
568
+ end
343
569
 
344
- result += if attr.data
345
- if attr.data.type
346
- "xsd:#{attr.data.type}"
347
- else
348
- "text"
349
- end
350
- else
351
- "text"
352
- end
570
+ rule(:preamble) do
571
+ preamble_item.repeat.as(:preamble_items)
572
+ end
353
573
 
354
- result += " }"
355
- result
574
+ # Root can be a grammar block with optional definitions after, OR plain grammar (for flat RNC files), with optional preamble at top
575
+ root(:grammar_wrapper)
576
+ rule(:grammar_wrapper) do
577
+ whitespace >>
578
+ preamble.maybe >>
579
+ whitespace >>
580
+ (
581
+ # Try in order from most specific to least specific
582
+ # 1. Grammar block (starts with literal "grammar {")
583
+ (grammar_block.as(:inner_grammar) >>
584
+ (whitespace >> (named_pattern | element_def.as(:top_element))).repeat.as(:trailing_definitions)) |
585
+ # 2. Top-level includes (for Metanorma-style schemas) - use raw capture for trailing
586
+ ((include_directive >> whitespace).repeat(1).as(:top_includes) >>
587
+ whitespace >> any.repeat.as(:raw_trailing)) |
588
+ # 3. Flat grammar (default - most flexible) - raw_patterns handled internally
589
+ grammar
590
+ ) >>
591
+ whitespace
592
+ end
593
+
594
+ # Class method to parse a file with include resolution
595
+ def self.parse_file(file_path, base_dir = nil, visited_files = Set.new)
596
+ IncludeProcessor.new.parse_file(file_path, base_dir, visited_files)
356
597
  end
357
598
 
358
- def build_pattern(node)
359
- if node.element
360
- build_element(node.element)
361
- elsif node.ref
362
- node.ref.name
363
- elsif node.choice
364
- choice_parts = []
365
- if node.choice.is_a?(Array)
366
- node.choice.each do |choice|
367
- choice_parts << build_pattern(choice)
599
+ # Pre-process RNC input to resolve hex escapes (\x{HHHHHH})
600
+ # outside of string literals. This allows keywords to contain hex escapes
601
+ # (e.g., \x{65}l\x{00065}ment = "element").
602
+ # String literals keep their hex escapes for the parser to handle,
603
+ # because control characters like \x{A} (newline) are forbidden inside
604
+ # single-line quoted strings and must remain escaped.
605
+ def self.preprocess_hex_escapes(input)
606
+ result = +''
607
+ i = 0
608
+ while i < input.length
609
+ # Triple-quoted strings: copy verbatim
610
+ if input[i, 3] == '"""'
611
+ end_idx = input.index('"""', i + 3)
612
+ end_idx ||= input.length - 3
613
+ result << input[i..(end_idx + 2)]
614
+ i = end_idx + 3
615
+ elsif input[i, 3] == "'''"
616
+ end_idx = input.index("'''", i + 3)
617
+ end_idx ||= input.length - 3
618
+ result << input[i..(end_idx + 2)]
619
+ i = end_idx + 3
620
+ # Single-line double-quoted string: copy verbatim
621
+ elsif input[i] == '"'
622
+ j = i + 1
623
+ while j < input.length && input[j] != '"'
624
+ j += 1 if input[j] == '\\' && j + 1 < input.length # skip escaped char
625
+ j += 1
368
626
  end
369
- else
370
- choice_parts << build_pattern(node.choice)
371
- end
372
- choice_parts.join(" | ")
373
- elsif node.group
374
- group_parts = []
375
- if node.group.is_a?(Array)
376
- node.group.each do |group|
377
- group_parts << build_pattern(group)
627
+ result << input[i..j]
628
+ i = j + 1
629
+ # Single-line single-quoted string: copy verbatim
630
+ elsif input[i] == "'"
631
+ j = i + 1
632
+ while j < input.length && input[j] != "'"
633
+ j += 1 if input[j] == '\\' && j + 1 < input.length # skip escaped char
634
+ j += 1
378
635
  end
636
+ result << input[i..j]
637
+ i = j + 1
638
+ # Comment: copy verbatim to end of line
639
+ elsif input[i] == '#'
640
+ j = input.index("\n", i) || input.length
641
+ result << input[i...j]
642
+ i = j
643
+ # Hex escape outside string: decode it
644
+ elsif input[i] == '\\' && input[i + 1] == 'x' && input[i + 2] == '{'
645
+ end_brace = input.index('}', i + 3)
646
+ if end_brace
647
+ hex = input[(i + 3)...end_brace]
648
+ if hex.match?(/\A[0-9a-fA-F]{1,6}\z/)
649
+ code_point = hex.to_i(16)
650
+ if code_point <= 0x10FFFF && !code_point.between?(0xD800, 0xDFFF) &&
651
+ code_point >= 0x20 # Reject control characters outside strings
652
+ result << [code_point].pack('U')
653
+ i = end_brace + 1
654
+ next
655
+ end
656
+ end
657
+ end
658
+ # Not a valid hex escape, copy as-is
659
+ result << input[i]
660
+ i += 1
379
661
  else
380
- group_parts << build_pattern(node.group)
662
+ result << input[i]
663
+ i += 1
381
664
  end
382
- "(#{group_parts.join(", ")})"
383
- elsif node.text
384
- "text"
385
- elsif node.empty
386
- "empty"
387
- else
388
- # Default case
389
- ""
390
665
  end
666
+ result
667
+ end
668
+
669
+ def self.parse(input)
670
+ parser = new
671
+ preprocessed = preprocess_hex_escapes(input.strip)
672
+ tree = parser.parse(preprocessed)
673
+
674
+ # Normalize parse tree
675
+ processor = ParseTreeProcessor.new(tree)
676
+ normalized = processor.normalize
677
+
678
+ # Convert to RNG XML and Grammar object
679
+ rng_xml = convert_to_rng(normalized.grammar_tree)
680
+ Grammar.from_xml(rng_xml)
681
+ end
682
+
683
+ # Convert RNG schema to RNC
684
+ def self.to_rnc(schema)
685
+ RncBuilder.new.build(schema)
686
+ end
687
+
688
+ # Convert parse tree to RNG XML
689
+ def self.convert_to_rng(tree)
690
+ RncToRngConverter.new.convert(tree)
391
691
  end
392
692
  end
393
693
  end