pubid 1.15.19 → 2.0.0.pre.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (604) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +1 -1
  3. data/README.adoc +2041 -53
  4. data/archived-gems/pubid-ccsds/update_codes.yaml +1 -0
  5. data/archived-gems/pubid-iec/stages.yaml +129 -0
  6. data/archived-gems/pubid-iec/update_codes.yaml +67 -0
  7. data/archived-gems/pubid-ieee/update_codes.yaml +104 -0
  8. data/archived-gems/pubid-iso/stages.yaml +106 -0
  9. data/archived-gems/pubid-iso/update_codes.yaml +4 -0
  10. data/archived-gems/pubid-itu/i18n.yaml +13 -0
  11. data/archived-gems/pubid-itu/series.yaml +42 -0
  12. data/archived-gems/pubid-nist/publishers.yaml +6 -0
  13. data/archived-gems/pubid-nist/series.yaml +121 -0
  14. data/archived-gems/pubid-nist/stages.yaml +16 -0
  15. data/archived-gems/pubid-nist/update_codes.yaml +93 -0
  16. data/archived-gems/pubid-plateau/update_codes.yaml +6 -0
  17. data/data/ccsds/update_codes.yaml +1 -0
  18. data/data/iec/update_codes.yaml +67 -0
  19. data/data/ieee/update_codes.yaml +104 -0
  20. data/data/iso/update_codes.yaml +21 -0
  21. data/data/nist/update_codes.yaml +89 -0
  22. data/data/plateau/update_codes.yaml +6 -0
  23. data/lib/pubid/amca/builder.rb +176 -0
  24. data/lib/pubid/amca/identifier.rb +57 -0
  25. data/lib/pubid/amca/identifiers/base.rb +64 -0
  26. data/lib/pubid/amca/identifiers/interpretation.rb +51 -0
  27. data/lib/pubid/amca/identifiers/publication.rb +47 -0
  28. data/lib/pubid/amca/identifiers/standard.rb +22 -0
  29. data/lib/pubid/amca/identifiers.rb +12 -0
  30. data/lib/pubid/amca/parser.rb +153 -0
  31. data/lib/pubid/amca/scheme.rb +16 -0
  32. data/lib/pubid/amca/single_identifier.rb +33 -0
  33. data/lib/pubid/amca/urn_generator.rb +50 -0
  34. data/lib/pubid/amca.rb +26 -0
  35. data/lib/pubid/ansi/builder.rb +52 -0
  36. data/lib/pubid/ansi/identifier.rb +55 -0
  37. data/lib/pubid/ansi/identifiers/american_national_standard.rb +12 -0
  38. data/lib/pubid/ansi/identifiers/standard.rb +16 -0
  39. data/lib/pubid/ansi/identifiers.rb +11 -0
  40. data/lib/pubid/ansi/parser.rb +91 -0
  41. data/lib/pubid/ansi/scheme.rb +15 -0
  42. data/lib/pubid/ansi/single_identifier.rb +45 -0
  43. data/lib/pubid/ansi/urn_generator.rb +76 -0
  44. data/lib/pubid/ansi.rb +27 -0
  45. data/lib/pubid/api/builder.rb +85 -0
  46. data/lib/pubid/api/components/code.rb +9 -0
  47. data/lib/pubid/api/identifier.rb +68 -0
  48. data/lib/pubid/api/identifiers/base.rb +24 -0
  49. data/lib/pubid/api/identifiers/bulletin.rb +15 -0
  50. data/lib/pubid/api/identifiers/continuous_operations_standard.rb +15 -0
  51. data/lib/pubid/api/identifiers/mpms.rb +44 -0
  52. data/lib/pubid/api/identifiers/publication.rb +15 -0
  53. data/lib/pubid/api/identifiers/recommended_practice.rb +15 -0
  54. data/lib/pubid/api/identifiers/specification.rb +15 -0
  55. data/lib/pubid/api/identifiers/standard.rb +15 -0
  56. data/lib/pubid/api/identifiers/technical_report.rb +15 -0
  57. data/lib/pubid/api/identifiers/typeless_standard.rb +27 -0
  58. data/lib/pubid/api/parser.rb +140 -0
  59. data/lib/pubid/api/scheme.rb +66 -0
  60. data/lib/pubid/api/single_identifier.rb +46 -0
  61. data/lib/pubid/api/urn_generator.rb +41 -0
  62. data/lib/pubid/api.rb +17 -0
  63. data/lib/pubid/ashrae/builder.rb +498 -0
  64. data/lib/pubid/ashrae/identifier.rb +57 -0
  65. data/lib/pubid/ashrae/identifiers/addenda_package.rb +46 -0
  66. data/lib/pubid/ashrae/identifiers/addendum.rb +55 -0
  67. data/lib/pubid/ashrae/identifiers/base.rb +23 -0
  68. data/lib/pubid/ashrae/identifiers/combined_addenda.rb +51 -0
  69. data/lib/pubid/ashrae/identifiers/errata.rb +40 -0
  70. data/lib/pubid/ashrae/identifiers/guideline.rb +38 -0
  71. data/lib/pubid/ashrae/identifiers/interpretation.rb +39 -0
  72. data/lib/pubid/ashrae/identifiers/standard.rb +38 -0
  73. data/lib/pubid/ashrae/identifiers.rb +16 -0
  74. data/lib/pubid/ashrae/parser.rb +724 -0
  75. data/lib/pubid/ashrae/scheme.rb +53 -0
  76. data/lib/pubid/ashrae/single_identifier.rb +23 -0
  77. data/lib/pubid/ashrae/supplement_identifier.rb +23 -0
  78. data/lib/pubid/ashrae/urn_generator.rb +59 -0
  79. data/lib/pubid/ashrae.rb +21 -0
  80. data/lib/pubid/asme/builder.rb +153 -0
  81. data/lib/pubid/asme/components/code.rb +18 -0
  82. data/lib/pubid/asme/identifier.rb +61 -0
  83. data/lib/pubid/asme/identifiers/base.rb +70 -0
  84. data/lib/pubid/asme/identifiers/standard.rb +12 -0
  85. data/lib/pubid/asme/identifiers.rb +10 -0
  86. data/lib/pubid/asme/parser.rb +308 -0
  87. data/lib/pubid/asme/scheme.rb +37 -0
  88. data/lib/pubid/asme/single_identifier.rb +29 -0
  89. data/lib/pubid/asme/urn_generator.rb +133 -0
  90. data/lib/pubid/asme.rb +21 -0
  91. data/lib/pubid/astm/builder.rb +159 -0
  92. data/lib/pubid/astm/components/code.rb +33 -0
  93. data/lib/pubid/astm/identifier.rb +92 -0
  94. data/lib/pubid/astm/identifiers/adjunct.rb +21 -0
  95. data/lib/pubid/astm/identifiers/base.rb +13 -0
  96. data/lib/pubid/astm/identifiers/data_series.rb +25 -0
  97. data/lib/pubid/astm/identifiers/iso_dual_published.rb +74 -0
  98. data/lib/pubid/astm/identifiers/manual.rb +40 -0
  99. data/lib/pubid/astm/identifiers/monograph.rb +25 -0
  100. data/lib/pubid/astm/identifiers/research_report.rb +18 -0
  101. data/lib/pubid/astm/identifiers/standard.rb +52 -0
  102. data/lib/pubid/astm/identifiers/technical_report.rb +23 -0
  103. data/lib/pubid/astm/identifiers/work_in_progress.rb +21 -0
  104. data/lib/pubid/astm/parser.rb +244 -0
  105. data/lib/pubid/astm/scheme.rb +55 -0
  106. data/lib/pubid/astm/single_identifier.rb +25 -0
  107. data/lib/pubid/astm/urn_generator.rb +99 -0
  108. data/lib/pubid/astm.rb +38 -0
  109. data/lib/pubid/bsi/builder.rb +1483 -0
  110. data/lib/pubid/bsi/components/code.rb +11 -0
  111. data/lib/pubid/bsi/components/date.rb +11 -0
  112. data/lib/pubid/bsi/components/publisher.rb +11 -0
  113. data/lib/pubid/bsi/components/type.rb +11 -0
  114. data/lib/pubid/bsi/identifier.rb +87 -0
  115. data/lib/pubid/bsi/identifiers/addendum_document.rb +64 -0
  116. data/lib/pubid/bsi/identifiers/adopted_european_norm.rb +95 -0
  117. data/lib/pubid/bsi/identifiers/adopted_international_standard.rb +82 -0
  118. data/lib/pubid/bsi/identifiers/aerospace_standard.rb +118 -0
  119. data/lib/pubid/bsi/identifiers/amendment.rb +40 -0
  120. data/lib/pubid/bsi/identifiers/base.rb +11 -0
  121. data/lib/pubid/bsi/identifiers/british_industrial_practice.rb +27 -0
  122. data/lib/pubid/bsi/identifiers/british_standard.rb +33 -0
  123. data/lib/pubid/bsi/identifiers/bundled_identifier.rb +114 -0
  124. data/lib/pubid/bsi/identifiers/committee_document.rb +51 -0
  125. data/lib/pubid/bsi/identifiers/consolidated_identifier.rb +152 -0
  126. data/lib/pubid/bsi/identifiers/corrigendum.rb +28 -0
  127. data/lib/pubid/bsi/identifiers/detailed_specification.rb +69 -0
  128. data/lib/pubid/bsi/identifiers/disc.rb +56 -0
  129. data/lib/pubid/bsi/identifiers/draft_document.rb +71 -0
  130. data/lib/pubid/bsi/identifiers/electronic_book.rb +52 -0
  131. data/lib/pubid/bsi/identifiers/expert_commentary.rb +47 -0
  132. data/lib/pubid/bsi/identifiers/explanatory_supplement.rb +82 -0
  133. data/lib/pubid/bsi/identifiers/flex.rb +61 -0
  134. data/lib/pubid/bsi/identifiers/handbook.rb +39 -0
  135. data/lib/pubid/bsi/identifiers/index.rb +62 -0
  136. data/lib/pubid/bsi/identifiers/method.rb +76 -0
  137. data/lib/pubid/bsi/identifiers/national_annex.rb +73 -0
  138. data/lib/pubid/bsi/identifiers/practice_guide.rb +27 -0
  139. data/lib/pubid/bsi/identifiers/publicly_available_specification.rb +79 -0
  140. data/lib/pubid/bsi/identifiers/published_document.rb +79 -0
  141. data/lib/pubid/bsi/identifiers/section.rb +62 -0
  142. data/lib/pubid/bsi/identifiers/set.rb +46 -0
  143. data/lib/pubid/bsi/identifiers/standalone_amendment.rb +40 -0
  144. data/lib/pubid/bsi/identifiers/supplement_document.rb +51 -0
  145. data/lib/pubid/bsi/identifiers/supplementary_index.rb +81 -0
  146. data/lib/pubid/bsi/identifiers/technical_specification.rb +79 -0
  147. data/lib/pubid/bsi/identifiers/test_method.rb +67 -0
  148. data/lib/pubid/bsi/identifiers/value_added_publication.rb +52 -0
  149. data/lib/pubid/bsi/identifiers.rb +52 -0
  150. data/lib/pubid/bsi/model.rb +196 -0
  151. data/lib/pubid/bsi/parser.rb +659 -0
  152. data/lib/pubid/bsi/scheme.rb +243 -0
  153. data/lib/pubid/bsi/single_identifier.rb +129 -0
  154. data/lib/pubid/bsi/urn_generator.rb +84 -0
  155. data/lib/pubid/bsi.rb +32 -0
  156. data/lib/pubid/builder/base.rb +138 -0
  157. data/lib/pubid/bundled_identifier.rb +126 -0
  158. data/lib/pubid/ccsds/builder.rb +56 -0
  159. data/lib/pubid/ccsds/identifier.rb +84 -0
  160. data/lib/pubid/ccsds/identifiers/base.rb +89 -0
  161. data/lib/pubid/ccsds/identifiers/base_BASE_88929.rb +70 -0
  162. data/lib/pubid/ccsds/identifiers/corrigendum.rb +39 -0
  163. data/lib/pubid/ccsds/identifiers.rb +10 -0
  164. data/lib/pubid/ccsds/parser.rb +71 -0
  165. data/lib/pubid/ccsds/scheme.rb +57 -0
  166. data/lib/pubid/ccsds/single_identifier.rb +77 -0
  167. data/lib/pubid/ccsds/supplement_identifier.rb +33 -0
  168. data/lib/pubid/ccsds/urn_generator.rb +115 -0
  169. data/lib/pubid/ccsds.rb +21 -0
  170. data/lib/pubid/cen_cenelec/builder.rb +330 -0
  171. data/lib/pubid/cen_cenelec/identifier.rb +52 -0
  172. data/lib/pubid/cen_cenelec/identifiers/adopted_european_norm.rb +40 -0
  173. data/lib/pubid/cen_cenelec/identifiers/amendment.rb +29 -0
  174. data/lib/pubid/cen_cenelec/identifiers/base.rb +75 -0
  175. data/lib/pubid/cen_cenelec/identifiers/cen_report.rb +28 -0
  176. data/lib/pubid/cen_cenelec/identifiers/cen_workshop_agreement.rb +27 -0
  177. data/lib/pubid/cen_cenelec/identifiers/cenelec_harmonization_document.rb +28 -0
  178. data/lib/pubid/cen_cenelec/identifiers/consolidated_identifier.rb +61 -0
  179. data/lib/pubid/cen_cenelec/identifiers/corrigendum.rb +35 -0
  180. data/lib/pubid/cen_cenelec/identifiers/european_norm.rb +41 -0
  181. data/lib/pubid/cen_cenelec/identifiers/european_prestandard.rb +37 -0
  182. data/lib/pubid/cen_cenelec/identifiers/european_specification.rb +28 -0
  183. data/lib/pubid/cen_cenelec/identifiers/fragment.rb +22 -0
  184. data/lib/pubid/cen_cenelec/identifiers/guide.rb +27 -0
  185. data/lib/pubid/cen_cenelec/identifiers/harmonization_document.rb +27 -0
  186. data/lib/pubid/cen_cenelec/identifiers/technical_report.rb +27 -0
  187. data/lib/pubid/cen_cenelec/identifiers/technical_specification.rb +35 -0
  188. data/lib/pubid/cen_cenelec/identifiers.rb +32 -0
  189. data/lib/pubid/cen_cenelec/parser.rb +144 -0
  190. data/lib/pubid/cen_cenelec/scheme.rb +164 -0
  191. data/lib/pubid/cen_cenelec/single_identifier.rb +130 -0
  192. data/lib/pubid/cen_cenelec/supplement_identifier.rb +48 -0
  193. data/lib/pubid/cen_cenelec/urn_generator.rb +129 -0
  194. data/lib/pubid/cen_cenelec.rb +21 -0
  195. data/lib/pubid/cie/builder.rb +399 -0
  196. data/lib/pubid/cie/components/code.rb +72 -0
  197. data/lib/pubid/cie/components/language.rb +58 -0
  198. data/lib/pubid/cie/identifier.rb +71 -0
  199. data/lib/pubid/cie/identifiers/bundle.rb +20 -0
  200. data/lib/pubid/cie/identifiers/conference.rb +32 -0
  201. data/lib/pubid/cie/identifiers/corrigendum.rb +40 -0
  202. data/lib/pubid/cie/identifiers/dual_published.rb +41 -0
  203. data/lib/pubid/cie/identifiers/identical.rb +64 -0
  204. data/lib/pubid/cie/identifiers/joint_published.rb +52 -0
  205. data/lib/pubid/cie/identifiers/standard.rb +58 -0
  206. data/lib/pubid/cie/identifiers/supplement.rb +45 -0
  207. data/lib/pubid/cie/identifiers/tutorial_bundle.rb +20 -0
  208. data/lib/pubid/cie/identifiers.rb +17 -0
  209. data/lib/pubid/cie/parser.rb +347 -0
  210. data/lib/pubid/cie/scheme.rb +64 -0
  211. data/lib/pubid/cie/single_identifier.rb +30 -0
  212. data/lib/pubid/cie/supplement_identifier.rb +26 -0
  213. data/lib/pubid/cie/urn_generator.rb +123 -0
  214. data/lib/pubid/cie.rb +28 -0
  215. data/lib/pubid/components/code.rb +33 -0
  216. data/lib/pubid/components/date.rb +49 -0
  217. data/lib/pubid/components/edition.rb +32 -0
  218. data/lib/pubid/components/factory.rb +50 -0
  219. data/lib/pubid/components/language.rb +37 -0
  220. data/lib/pubid/components/locality.rb +10 -0
  221. data/lib/pubid/components/publisher.rb +36 -0
  222. data/lib/pubid/components/stage.rb +54 -0
  223. data/lib/pubid/components/type.rb +58 -0
  224. data/lib/pubid/components/typed_stage.rb +59 -0
  225. data/lib/pubid/components.rb +16 -0
  226. data/lib/pubid/core/pattern_doc_generator.rb +272 -0
  227. data/lib/pubid/core/update_codes.rb +77 -0
  228. data/lib/pubid/core.rb +8 -0
  229. data/lib/pubid/csa/builder.rb +671 -0
  230. data/lib/pubid/csa/components/code.rb +9 -0
  231. data/lib/pubid/csa/components.rb +9 -0
  232. data/lib/pubid/csa/composite_identifier.rb +27 -0
  233. data/lib/pubid/csa/identifier.rb +513 -0
  234. data/lib/pubid/csa/identifiers/base.rb +133 -0
  235. data/lib/pubid/csa/identifiers/bundled.rb +125 -0
  236. data/lib/pubid/csa/identifiers/canadian_adopted.rb +82 -0
  237. data/lib/pubid/csa/identifiers/cec.rb +129 -0
  238. data/lib/pubid/csa/identifiers/combined.rb +130 -0
  239. data/lib/pubid/csa/identifiers/csa_adopted.rb +78 -0
  240. data/lib/pubid/csa/identifiers/package.rb +65 -0
  241. data/lib/pubid/csa/identifiers/series.rb +127 -0
  242. data/lib/pubid/csa/identifiers/standard.rb +10 -0
  243. data/lib/pubid/csa/identifiers.rb +17 -0
  244. data/lib/pubid/csa/parser.rb +445 -0
  245. data/lib/pubid/csa/scheme.rb +44 -0
  246. data/lib/pubid/csa/single_identifier.rb +30 -0
  247. data/lib/pubid/csa/urn_generator.rb +80 -0
  248. data/lib/pubid/csa/wrapper_identifier.rb +31 -0
  249. data/lib/pubid/csa.rb +25 -0
  250. data/lib/pubid/etsi/builder.rb +133 -0
  251. data/lib/pubid/etsi/components/code.rb +42 -0
  252. data/lib/pubid/etsi/components/version.rb +37 -0
  253. data/lib/pubid/etsi/components.rb +10 -0
  254. data/lib/pubid/etsi/identifier.rb +57 -0
  255. data/lib/pubid/etsi/identifiers/amendment.rb +15 -0
  256. data/lib/pubid/etsi/identifiers/base.rb +38 -0
  257. data/lib/pubid/etsi/identifiers/corrigendum.rb +15 -0
  258. data/lib/pubid/etsi/identifiers/etsi_standard.rb +19 -0
  259. data/lib/pubid/etsi/identifiers/supplement_identifier.rb +91 -0
  260. data/lib/pubid/etsi/identifiers.rb +14 -0
  261. data/lib/pubid/etsi/parser.rb +133 -0
  262. data/lib/pubid/etsi/scheme.rb +42 -0
  263. data/lib/pubid/etsi/urn_generator.rb +76 -0
  264. data/lib/pubid/etsi.rb +21 -0
  265. data/lib/pubid/export/auditor.rb +89 -0
  266. data/lib/pubid/export/data_class_exporter.rb +59 -0
  267. data/lib/pubid/export/exporter.rb +74 -0
  268. data/lib/pubid/export/flavor_exporter.rb +402 -0
  269. data/lib/pubid/export/ieee_exporter.rb +78 -0
  270. data/lib/pubid/export/itu_exporter.rb +66 -0
  271. data/lib/pubid/export/nist_exporter.rb +64 -0
  272. data/lib/pubid/export/registry_exporter.rb +90 -0
  273. data/lib/pubid/export/result.rb +97 -0
  274. data/lib/pubid/export/scheme_exporter.rb +70 -0
  275. data/lib/pubid/export.rb +18 -0
  276. data/lib/pubid/format_detector.rb +16 -0
  277. data/lib/pubid/format_registry.rb +42 -0
  278. data/lib/pubid/identifier.rb +242 -0
  279. data/lib/pubid/identifier_metadata.rb +148 -0
  280. data/lib/pubid/identifier_registry.rb +198 -0
  281. data/lib/pubid/idf/builder.rb +82 -0
  282. data/lib/pubid/idf/identifier.rb +129 -0
  283. data/lib/pubid/idf/identifiers/amendment.rb +27 -0
  284. data/lib/pubid/idf/identifiers/corrigendum.rb +27 -0
  285. data/lib/pubid/idf/identifiers/international_standard.rb +123 -0
  286. data/lib/pubid/idf/identifiers/reviewed_method.rb +100 -0
  287. data/lib/pubid/idf/identifiers.rb +13 -0
  288. data/lib/pubid/idf/parser.rb +143 -0
  289. data/lib/pubid/idf/scheme.rb +61 -0
  290. data/lib/pubid/idf/single_identifier.rb +19 -0
  291. data/lib/pubid/idf/supplement_identifier.rb +43 -0
  292. data/lib/pubid/idf/urn_generator.rb +84 -0
  293. data/lib/pubid/idf.rb +25 -0
  294. data/lib/pubid/iec/builder.rb +458 -0
  295. data/lib/pubid/iec/components/code.rb +60 -0
  296. data/lib/pubid/iec/components/consolidated_amendment.rb +59 -0
  297. data/lib/pubid/iec/components/publisher.rb +36 -0
  298. data/lib/pubid/iec/components/sheet.rb +32 -0
  299. data/lib/pubid/iec/components/trf_info.rb +38 -0
  300. data/lib/pubid/iec/components/vap_suffix.rb +41 -0
  301. data/lib/pubid/iec/identifier.rb +256 -0
  302. data/lib/pubid/iec/identifiers/amendment.rb +94 -0
  303. data/lib/pubid/iec/identifiers/base.rb +82 -0
  304. data/lib/pubid/iec/identifiers/component_specification.rb +39 -0
  305. data/lib/pubid/iec/identifiers/conformity_assessment.rb +39 -0
  306. data/lib/pubid/iec/identifiers/consolidated_identifier.rb +82 -0
  307. data/lib/pubid/iec/identifiers/corrigendum.rb +94 -0
  308. data/lib/pubid/iec/identifiers/fragment_identifier.rb +137 -0
  309. data/lib/pubid/iec/identifiers/guide.rb +104 -0
  310. data/lib/pubid/iec/identifiers/international_standard.rb +147 -0
  311. data/lib/pubid/iec/identifiers/interpretation_sheet.rb +104 -0
  312. data/lib/pubid/iec/identifiers/operational_document.rb +39 -0
  313. data/lib/pubid/iec/identifiers/publicly_available_specification.rb +101 -0
  314. data/lib/pubid/iec/identifiers/sheet_identifier.rb +62 -0
  315. data/lib/pubid/iec/identifiers/societal_technology_trend_report.rb +40 -0
  316. data/lib/pubid/iec/identifiers/systems_reference_document.rb +40 -0
  317. data/lib/pubid/iec/identifiers/technical_report.rb +132 -0
  318. data/lib/pubid/iec/identifiers/technical_specification.rb +132 -0
  319. data/lib/pubid/iec/identifiers/technology_report.rb +39 -0
  320. data/lib/pubid/iec/identifiers/test_report_form.rb +78 -0
  321. data/lib/pubid/iec/identifiers/vap_identifier.rb +73 -0
  322. data/lib/pubid/iec/identifiers/white_paper.rb +39 -0
  323. data/lib/pubid/iec/identifiers/working_document.rb +96 -0
  324. data/lib/pubid/iec/parser.rb +417 -0
  325. data/lib/pubid/iec/rendering_style.rb +113 -0
  326. data/lib/pubid/iec/scheme.rb +71 -0
  327. data/lib/pubid/iec/single_identifier.rb +80 -0
  328. data/lib/pubid/iec/supplement_identifier.rb +161 -0
  329. data/lib/pubid/iec/urn_generator.rb +79 -0
  330. data/lib/pubid/iec/urn_parser.rb +90 -0
  331. data/lib/pubid/iec.rb +85 -0
  332. data/lib/pubid/ieee/aiee/builder.rb +71 -0
  333. data/lib/pubid/ieee/aiee/identifier.rb +105 -0
  334. data/lib/pubid/ieee/aiee/parser.rb +130 -0
  335. data/lib/pubid/ieee/aiee.rb +11 -0
  336. data/lib/pubid/ieee/builder.rb +1237 -0
  337. data/lib/pubid/ieee/components/code.rb +102 -0
  338. data/lib/pubid/ieee/components/draft.rb +93 -0
  339. data/lib/pubid/ieee/components/relationship.rb +157 -0
  340. data/lib/pubid/ieee/components/typed_stage.rb +100 -0
  341. data/lib/pubid/ieee/identifier.rb +54 -0
  342. data/lib/pubid/ieee/identifiers/adopted_standard.rb +33 -0
  343. data/lib/pubid/ieee/identifiers/base.rb +591 -0
  344. data/lib/pubid/ieee/identifiers/conformance_identifier.rb +35 -0
  345. data/lib/pubid/ieee/identifiers/corrigendum.rb +37 -0
  346. data/lib/pubid/ieee/identifiers/csa_dual_published.rb +51 -0
  347. data/lib/pubid/ieee/identifiers/dual_identifier.rb +18 -0
  348. data/lib/pubid/ieee/identifiers/dual_published.rb +28 -0
  349. data/lib/pubid/ieee/identifiers/iec_ieee_copublished.rb +27 -0
  350. data/lib/pubid/ieee/identifiers/interpretation_identifier.rb +34 -0
  351. data/lib/pubid/ieee/identifiers/joint_development.rb +172 -0
  352. data/lib/pubid/ieee/identifiers/multi_numbered_identifier.rb +51 -0
  353. data/lib/pubid/ieee/identifiers/nesc/base.rb +56 -0
  354. data/lib/pubid/ieee/identifiers/nesc/draft.rb +28 -0
  355. data/lib/pubid/ieee/identifiers/nesc/handbook.rb +32 -0
  356. data/lib/pubid/ieee/identifiers/nesc/redline.rb +26 -0
  357. data/lib/pubid/ieee/identifiers/nesc/standard.rb +26 -0
  358. data/lib/pubid/ieee/identifiers/nesc.rb +15 -0
  359. data/lib/pubid/ieee/identifiers/parenthetical_identifier.rb +20 -0
  360. data/lib/pubid/ieee/identifiers/project_draft_identifier.rb +26 -0
  361. data/lib/pubid/ieee/identifiers/redlined_standard.rb +33 -0
  362. data/lib/pubid/ieee/identifiers/si_standard.rb +73 -0
  363. data/lib/pubid/ieee/identifiers/standard.rb +41 -0
  364. data/lib/pubid/ieee/identifiers/supplement_identifier.rb +23 -0
  365. data/lib/pubid/ieee/identifiers.rb +33 -0
  366. data/lib/pubid/ieee/ire/builder.rb +61 -0
  367. data/lib/pubid/ieee/ire/identifier.rb +58 -0
  368. data/lib/pubid/ieee/ire/parser.rb +91 -0
  369. data/lib/pubid/ieee/ire.rb +11 -0
  370. data/lib/pubid/ieee/nesc/builder.rb +101 -0
  371. data/lib/pubid/ieee/nesc/parser.rb +154 -0
  372. data/lib/pubid/ieee/nesc.rb +10 -0
  373. data/lib/pubid/ieee/parser.rb +1226 -0
  374. data/lib/pubid/ieee/scheme.rb +90 -0
  375. data/lib/pubid/ieee/typed_stages.rb +172 -0
  376. data/lib/pubid/ieee/urn_generator.rb +188 -0
  377. data/lib/pubid/ieee.rb +32 -0
  378. data/lib/pubid/ieee_debug.rb +31 -0
  379. data/lib/pubid/iho/builder.rb +37 -0
  380. data/lib/pubid/iho/identifier.rb +61 -0
  381. data/lib/pubid/iho/identifiers/base.rb +41 -0
  382. data/lib/pubid/iho/identifiers/bibliographic.rb +16 -0
  383. data/lib/pubid/iho/identifiers/circular_letter.rb +15 -0
  384. data/lib/pubid/iho/identifiers/miscellaneous.rb +16 -0
  385. data/lib/pubid/iho/identifiers/publication.rb +15 -0
  386. data/lib/pubid/iho/identifiers/standard.rb +15 -0
  387. data/lib/pubid/iho/identifiers.rb +14 -0
  388. data/lib/pubid/iho/parser.rb +68 -0
  389. data/lib/pubid/iho/scheme.rb +29 -0
  390. data/lib/pubid/iho/urn_generator.rb +29 -0
  391. data/lib/pubid/iho.rb +21 -0
  392. data/lib/pubid/iso/builder.rb +309 -0
  393. data/lib/pubid/iso/bundled_identifier.rb +85 -0
  394. data/lib/pubid/iso/combined_identifier.rb +22 -0
  395. data/lib/pubid/iso/components/code.rb +36 -0
  396. data/lib/pubid/iso/components/publisher.rb +60 -0
  397. data/lib/pubid/iso/components.rb +12 -0
  398. data/lib/pubid/iso/format_resolver.rb +45 -0
  399. data/lib/pubid/iso/identifier.rb +330 -0
  400. data/lib/pubid/iso/identifiers/addendum.rb +104 -0
  401. data/lib/pubid/iso/identifiers/amendment.rb +128 -0
  402. data/lib/pubid/iso/identifiers/base.rb +115 -0
  403. data/lib/pubid/iso/identifiers/corrigendum.rb +108 -0
  404. data/lib/pubid/iso/identifiers/data.rb +76 -0
  405. data/lib/pubid/iso/identifiers/directives.rb +59 -0
  406. data/lib/pubid/iso/identifiers/directives_supplement.rb +119 -0
  407. data/lib/pubid/iso/identifiers/extract.rb +30 -0
  408. data/lib/pubid/iso/identifiers/guide.rb +100 -0
  409. data/lib/pubid/iso/identifiers/international_standard.rb +168 -0
  410. data/lib/pubid/iso/identifiers/international_standardized_profile.rb +94 -0
  411. data/lib/pubid/iso/identifiers/international_workshop_agreement.rb +89 -0
  412. data/lib/pubid/iso/identifiers/pas.rb +93 -0
  413. data/lib/pubid/iso/identifiers/recommendation.rb +45 -0
  414. data/lib/pubid/iso/identifiers/supplement.rb +87 -0
  415. data/lib/pubid/iso/identifiers/tc_document.rb +108 -0
  416. data/lib/pubid/iso/identifiers/technical_report.rb +103 -0
  417. data/lib/pubid/iso/identifiers/technical_specification.rb +102 -0
  418. data/lib/pubid/iso/identifiers/technology_trends_assessments.rb +95 -0
  419. data/lib/pubid/iso/identifiers.rb +33 -0
  420. data/lib/pubid/iso/parser.rb +512 -0
  421. data/lib/pubid/iso/rendering_style.rb +120 -0
  422. data/lib/pubid/iso/scheme.rb +193 -0
  423. data/lib/pubid/iso/single_identifier.rb +64 -0
  424. data/lib/pubid/iso/supplement_identifier.rb +27 -0
  425. data/lib/pubid/iso/urn_generator.rb +426 -0
  426. data/lib/pubid/iso/urn_parser.rb +437 -0
  427. data/lib/pubid/iso/utilities.rb +86 -0
  428. data/lib/pubid/iso.rb +50 -0
  429. data/lib/pubid/itu/builder.rb +171 -0
  430. data/lib/pubid/itu/components/code.rb +39 -0
  431. data/lib/pubid/itu/components/sector.rb +35 -0
  432. data/lib/pubid/itu/components/series.rb +29 -0
  433. data/lib/pubid/itu/i18n.rb +9 -0
  434. data/lib/pubid/itu/i18n.yaml +30 -0
  435. data/lib/pubid/itu/identifier.rb +118 -0
  436. data/lib/pubid/itu/identifiers/amendment.rb +43 -0
  437. data/lib/pubid/itu/identifiers/annex.rb +74 -0
  438. data/lib/pubid/itu/identifiers/base.rb +154 -0
  439. data/lib/pubid/itu/identifiers/combined_identifier.rb +47 -0
  440. data/lib/pubid/itu/identifiers/corrigendum.rb +44 -0
  441. data/lib/pubid/itu/identifiers/recommendation.rb +16 -0
  442. data/lib/pubid/itu/identifiers/special_publication.rb +31 -0
  443. data/lib/pubid/itu/identifiers/supplement.rb +46 -0
  444. data/lib/pubid/itu/identifiers.rb +16 -0
  445. data/lib/pubid/itu/model.rb +111 -0
  446. data/lib/pubid/itu/parser.rb +225 -0
  447. data/lib/pubid/itu/scheme.rb +174 -0
  448. data/lib/pubid/itu/urn_generator.rb +105 -0
  449. data/lib/pubid/itu.rb +22 -0
  450. data/lib/pubid/jcgm/builder.rb +88 -0
  451. data/lib/pubid/jcgm/components/publisher.rb +20 -0
  452. data/lib/pubid/jcgm/components.rb +9 -0
  453. data/lib/pubid/jcgm/identifier.rb +54 -0
  454. data/lib/pubid/jcgm/identifiers/amendment.rb +35 -0
  455. data/lib/pubid/jcgm/identifiers/guide.rb +21 -0
  456. data/lib/pubid/jcgm/identifiers/gum_guide.rb +51 -0
  457. data/lib/pubid/jcgm/identifiers.rb +11 -0
  458. data/lib/pubid/jcgm/parser.rb +84 -0
  459. data/lib/pubid/jcgm/scheme.rb +60 -0
  460. data/lib/pubid/jcgm/single_identifier.rb +48 -0
  461. data/lib/pubid/jcgm/supplement_identifier.rb +16 -0
  462. data/lib/pubid/jcgm/urn_generator.rb +110 -0
  463. data/lib/pubid/jcgm.rb +31 -0
  464. data/lib/pubid/jis/builder.rb +124 -0
  465. data/lib/pubid/jis/components/code.rb +59 -0
  466. data/lib/pubid/jis/components.rb +9 -0
  467. data/lib/pubid/jis/identifier.rb +61 -0
  468. data/lib/pubid/jis/identifiers/amendment.rb +16 -0
  469. data/lib/pubid/jis/identifiers/base.rb +72 -0
  470. data/lib/pubid/jis/identifiers/explanation.rb +22 -0
  471. data/lib/pubid/jis/identifiers/japanese_industrial_standard.rb +16 -0
  472. data/lib/pubid/jis/identifiers/standard.rb +27 -0
  473. data/lib/pubid/jis/identifiers/technical_report.rb +31 -0
  474. data/lib/pubid/jis/identifiers/technical_specification.rb +31 -0
  475. data/lib/pubid/jis/identifiers.rb +17 -0
  476. data/lib/pubid/jis/parser.rb +109 -0
  477. data/lib/pubid/jis/scheme.rb +49 -0
  478. data/lib/pubid/jis/single_identifier.rb +37 -0
  479. data/lib/pubid/jis/supplement_identifier.rb +47 -0
  480. data/lib/pubid/jis/urn_generator.rb +25 -0
  481. data/lib/pubid/jis.rb +23 -0
  482. data/lib/pubid/lutaml/no_store_registration.rb +30 -0
  483. data/lib/pubid/nist/builder.rb +2269 -0
  484. data/lib/pubid/nist/components/code.rb +38 -0
  485. data/lib/pubid/nist/components/edition.rb +134 -0
  486. data/lib/pubid/nist/components/issue_number.rb +28 -0
  487. data/lib/pubid/nist/components/part.rb +77 -0
  488. data/lib/pubid/nist/components/publisher.rb +24 -0
  489. data/lib/pubid/nist/components/stage.rb +53 -0
  490. data/lib/pubid/nist/components/supplement.rb +188 -0
  491. data/lib/pubid/nist/components/translation.rb +42 -0
  492. data/lib/pubid/nist/components/update.rb +103 -0
  493. data/lib/pubid/nist/components/version.rb +35 -0
  494. data/lib/pubid/nist/components/volume.rb +32 -0
  495. data/lib/pubid/nist/components.rb +19 -0
  496. data/lib/pubid/nist/configuration.rb +77 -0
  497. data/lib/pubid/nist/identifier.rb +62 -0
  498. data/lib/pubid/nist/identifiers/base.rb +578 -0
  499. data/lib/pubid/nist/identifiers/circular.rb +68 -0
  500. data/lib/pubid/nist/identifiers/circular_supplement.rb +50 -0
  501. data/lib/pubid/nist/identifiers/commercial_standard.rb +41 -0
  502. data/lib/pubid/nist/identifiers/commercial_standard_emergency.rb +56 -0
  503. data/lib/pubid/nist/identifiers/commercial_standards_monthly.rb +56 -0
  504. data/lib/pubid/nist/identifiers/crpl_report.rb +132 -0
  505. data/lib/pubid/nist/identifiers/federal_information_processing_standards.rb +104 -0
  506. data/lib/pubid/nist/identifiers/grant_contractor_report.rb +35 -0
  507. data/lib/pubid/nist/identifiers/handbook.rb +50 -0
  508. data/lib/pubid/nist/identifiers/internal_report.rb +56 -0
  509. data/lib/pubid/nist/identifiers/letter_circular.rb +45 -0
  510. data/lib/pubid/nist/identifiers/miscellaneous_publication.rb +65 -0
  511. data/lib/pubid/nist/identifiers/monograph.rb +69 -0
  512. data/lib/pubid/nist/identifiers/ncstar.rb +41 -0
  513. data/lib/pubid/nist/identifiers/nsrds.rb +41 -0
  514. data/lib/pubid/nist/identifiers/owmwp.rb +35 -0
  515. data/lib/pubid/nist/identifiers/report.rb +67 -0
  516. data/lib/pubid/nist/identifiers/special_publication.rb +36 -0
  517. data/lib/pubid/nist/identifiers/technical_note.rb +90 -0
  518. data/lib/pubid/nist/identifiers.rb +33 -0
  519. data/lib/pubid/nist/parser.rb +1117 -0
  520. data/lib/pubid/nist/scheme.rb +199 -0
  521. data/lib/pubid/nist/supplement_identifier.rb +67 -0
  522. data/lib/pubid/nist/urn_generator.rb +133 -0
  523. data/lib/pubid/nist.rb +37 -0
  524. data/lib/pubid/oiml/builder.rb +189 -0
  525. data/lib/pubid/oiml/components/code.rb +20 -0
  526. data/lib/pubid/oiml/components.rb +9 -0
  527. data/lib/pubid/oiml/identifier.rb +61 -0
  528. data/lib/pubid/oiml/identifiers/amendment.rb +13 -0
  529. data/lib/pubid/oiml/identifiers/annex.rb +62 -0
  530. data/lib/pubid/oiml/identifiers/base.rb +36 -0
  531. data/lib/pubid/oiml/identifiers/basic_publication.rb +13 -0
  532. data/lib/pubid/oiml/identifiers/document.rb +13 -0
  533. data/lib/pubid/oiml/identifiers/expert_report.rb +13 -0
  534. data/lib/pubid/oiml/identifiers/guide.rb +13 -0
  535. data/lib/pubid/oiml/identifiers/recommendation.rb +13 -0
  536. data/lib/pubid/oiml/identifiers/seminar_report.rb +13 -0
  537. data/lib/pubid/oiml/identifiers/vocabulary.rb +13 -0
  538. data/lib/pubid/oiml/identifiers.rb +18 -0
  539. data/lib/pubid/oiml/parser.rb +173 -0
  540. data/lib/pubid/oiml/scheme.rb +46 -0
  541. data/lib/pubid/oiml/single_identifier.rb +90 -0
  542. data/lib/pubid/oiml/supplement_identifier.rb +43 -0
  543. data/lib/pubid/oiml/urn_generator.rb +64 -0
  544. data/lib/pubid/oiml.rb +26 -0
  545. data/lib/pubid/parser/common_parse_methods.rb +13 -0
  546. data/lib/pubid/parser/common_parse_rules.rb +56 -0
  547. data/lib/pubid/parser.rb +8 -0
  548. data/lib/pubid/parsers/base.rb +11 -0
  549. data/lib/pubid/parsers/mr_string.rb +93 -0
  550. data/lib/pubid/plateau/builder.rb +50 -0
  551. data/lib/pubid/plateau/identifier.rb +57 -0
  552. data/lib/pubid/plateau/identifiers/annex.rb +16 -0
  553. data/lib/pubid/plateau/identifiers/base.rb +51 -0
  554. data/lib/pubid/plateau/identifiers/handbook.rb +34 -0
  555. data/lib/pubid/plateau/identifiers/technical_report.rb +20 -0
  556. data/lib/pubid/plateau/identifiers.rb +12 -0
  557. data/lib/pubid/plateau/parser.rb +63 -0
  558. data/lib/pubid/plateau/scheme.rb +45 -0
  559. data/lib/pubid/plateau/supplement_identifier.rb +72 -0
  560. data/lib/pubid/plateau/urn_generator.rb +29 -0
  561. data/lib/pubid/plateau.rb +26 -0
  562. data/lib/pubid/renderers/base.rb +53 -0
  563. data/lib/pubid/renderers/directives_renderer.rb +61 -0
  564. data/lib/pubid/renderers/guide_renderer.rb +24 -0
  565. data/lib/pubid/renderers/human_readable.rb +70 -0
  566. data/lib/pubid/renderers/iwa_renderer.rb +20 -0
  567. data/lib/pubid/renderers/mr_string.rb +16 -0
  568. data/lib/pubid/renderers/supplement_renderer.rb +36 -0
  569. data/lib/pubid/renderers/urn.rb +11 -0
  570. data/lib/pubid/renderers.rb +14 -0
  571. data/lib/pubid/rendering/base.rb +73 -0
  572. data/lib/pubid/rendering/common.rb +211 -0
  573. data/lib/pubid/rendering/context.rb +159 -0
  574. data/lib/pubid/rendering/date.rb +27 -0
  575. data/lib/pubid/rendering/format.rb +25 -0
  576. data/lib/pubid/rendering/language.rb +21 -0
  577. data/lib/pubid/rendering/numbering.rb +24 -0
  578. data/lib/pubid/rendering/publisher.rb +25 -0
  579. data/lib/pubid/rendering/stage.rb +38 -0
  580. data/lib/pubid/rendering/supplement.rb +46 -0
  581. data/lib/pubid/rendering.rb +16 -0
  582. data/lib/pubid/sae/builder.rb +32 -0
  583. data/lib/pubid/sae/components/code.rb +9 -0
  584. data/lib/pubid/sae/components/date.rb +19 -0
  585. data/lib/pubid/sae/components/type.rb +19 -0
  586. data/lib/pubid/sae/components.rb +11 -0
  587. data/lib/pubid/sae/identifier.rb +37 -0
  588. data/lib/pubid/sae/identifiers/base.rb +42 -0
  589. data/lib/pubid/sae/identifiers.rb +9 -0
  590. data/lib/pubid/sae/parser.rb +55 -0
  591. data/lib/pubid/sae/scheme.rb +47 -0
  592. data/lib/pubid/sae/urn_generator.rb +38 -0
  593. data/lib/pubid/sae.rb +19 -0
  594. data/lib/pubid/scheme.rb +219 -0
  595. data/lib/pubid/urn_generator/base.rb +110 -0
  596. data/lib/pubid/utils/string_normalizer.rb +196 -0
  597. data/lib/pubid/utils.rb +7 -0
  598. data/lib/pubid/version.rb +3 -1
  599. data/lib/pubid.rb +137 -13
  600. data/lib/tasks/docs.rake +37 -0
  601. data/lib/tasks/export.rake +38 -0
  602. data/lib/tasks/website-data.json +7488 -0
  603. metadata +616 -171
  604. data/lib/pubid/registry.rb +0 -30
@@ -0,0 +1,1117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "parslet"
4
+
5
+ module Pubid
6
+ module Nist
7
+ # Parser class for NIST identifiers
8
+ # Single Responsibility: Parsing NIST identifier syntax
9
+ class Parser < Parslet::Parser
10
+ # Class-level parse method with preprocessing
11
+ # Handles data quality normalization before parsing
12
+ # Named explicitly to avoid conflict with Parslet's built-in parse method
13
# Normalizes a raw NIST/NBS identifier string and parses it with the grammar.
#
# The method applies an ORDER-SENSITIVE chain of textual rewrites (each one
# documented inline) so that legacy, machine-readable (MR) and verbose
# spellings are presented to the Parslet grammar in one shape, then parses
# the normalized text and tags the result with the detected input format.
#
# @param input [#to_s] raw identifier, e.g. "NIST SP 800-53r4" or "NIST.SP.800-53r4"
# @return [Hash] the parse tree plus +:parsed_format+ (+:mr+ or +:short+) when
#   the parser yields a Hash or an Array of Hashes; any other parser result
#   is returned unchanged
# @note Errors raised by +new.parse+ are not rescued here
#   (presumably Parslet::ParseFailed — confirm against callers).
def self.class_parse_with_preprocessing(input)
  raw = input.to_s.strip

  # Apply legacy update_codes normalization first, before any other preprocessing
  cleaned = Core::UpdateCodes.apply(raw, :nist)

  # Fix lowercase publisher at start
  cleaned = cleaned.sub(/^nbs\b/i, "NBS")
  cleaned = cleaned.sub(/^nist\b/i, "NIST")

  # Fix publisher+series concatenation: "NISTIR" → "NIST IR", "NBSIR" → "NBS IR".
  # The match is case-insensitive, so both halves are upcased explicitly:
  # "nistir" has no word boundary after "nist" and is therefore NOT touched by
  # the publisher fix above; re-emitting the captures verbatim would leave a
  # lowercase publisher ("nist ir") that the grammar cannot accept.
  cleaned = cleaned.gsub(/^(NBS|NIST)(IR|FIPS|GCR|HB|MONO|MP|NCSTAR|NSRDS)/i) do
    "#{$1.upcase} #{$2.upcase}"
  end

  # Fix lowercase series (ir, sp, tn, etc.) — first occurrence only
  cleaned = cleaned.sub(/\b(ir|sp|tn|hb|fips|ams|vts)\b/i, &:upcase)

  # Normalize LC to LCIRC (single definition of truth)
  # Pattern: "LC" followed by space/dot/end should become "LCIRC"
  # But don't change if already "LCIRC"
  cleaned = cleaned.gsub(/\bLC\b(?!IRC)/, "LCIRC")

  # Combine "NBS LCIRC" with space into "NBS.LCIRC" ONLY when followed by supplement marker
  # This allows the circ_supplement_identifier rule to match the pattern
  # Only apply to supplement cases, not regular LCIRC identifiers
  cleaned = cleaned.gsub(/\bNBS LCIRC\b(?=.*\b(?:supp?|sup\+|r\d+\/)\d)/,
                         "NBS.LCIRC")

  # Convert MR format LCIRC supplements to space-separated format
  # "NBS.LCIRC.145r11/1925" → "NBS LCIRC 145r11/1925" (convert series dot to space)
  cleaned = cleaned.gsub(/\bNBS\.LCIRC\.(\d+r\d+\/\d{4})/,
                         "NBS LCIRC \\1")
  # Also handle without year: "NBS.LCIRC.145r11" → "NBS LCIRC 145r11"
  cleaned = cleaned.gsub(/\bNBS\.LCIRC\.(\d+r\d+)\b/, "NBS LCIRC \\1")

  # Fix Roman numerals: "1011-I-2" → keep as is, but fix spaces: "1011-I-2 0" → "1011-I-2.0"
  cleaned = cleaned.gsub(/([-\d]+[IVX]+[-\d]+)\s+(\d+)/, '\1.\2')

  # Fix rev without space: "126rev2013" → "126 rev2013" (separate number from rev+year)
  # BUT preserve edition+revision patterns: "e2rev1908" stays as-is
  cleaned = cleaned.gsub(/(?<!e)(\d)(rev\d{4})/, '\1 \2')

  # Fix revision with slash and year: "145r6/1925" → "145 r6/1925"
  # BUT NOT for CIRC/LCIRC series (keep "NBS LCIRC 145r11/1925" as-is for parser);
  # the circ_supplement_identifier rule expects "145r11" (no space).
  # "LCIRC" contains "CIRC", so a single substring test covers both.
  unless cleaned.include?("CIRC")
    cleaned = cleaned.gsub(/(\d)(r\d+\/\d{4})/, '\1 \2')
  end

  # Fix revision with just year (no slash): "1128r1995" → "1128 r1995"
  # BUT preserve edition+revision patterns: "13e2rev1908" stays as-is
  # AND preserve month abbreviations in patterns like "107-Mar1985" (ar1985 contains 'r')
  # AND preserve "rv" (revision year) patterns: "1013rv1953" stays as-is
  # NOTE(review): the leading \b means an "r" glued to a digit ("1128r1995")
  # does not match — there is no word boundary between "8" and "r". Verify
  # against the intended examples before relying on this rule.
  cleaned = cleaned.gsub(/\b(r(?!v)\d{4})\b/, ' \1')

  # Fix month in revision: "4743rJun1992" → "4743 rJun1992"
  cleaned = cleaned.gsub(/(\d)(r[A-Z][a-z]{2,8}\d{4})/, '\1 \2')
  # Revision with 1-2 digits + lowercase letter ("22r1a") is handled by the
  # revision-letter rules further down, which keep it together (no space)
  # for second_number pattern matching.

  # Normalize lowercase letter suffix to uppercase
  # Fix dash-letter pattern: "6529-a" → "6529-A"
  cleaned = cleaned.gsub(/(\d)-([a-z])$/) { "#{$1}-#{$2.upcase}" }

  # Fix direct letter suffix (no dash): "378g" → "378G", "1000a" → "1000A", "97-3b" → "97-3B"
  # MUST come after dash pattern to avoid conflicts
  # Excludes a trailing "r" (e.g., "73-197r", "6945r"), which stays lowercase
  # for edition pattern matching.
  # Only matches a single letter at end, not part of words like "index", "sec", etc.
  cleaned = cleaned.gsub(/(\d)([a-z&&[^r]])$/) { "#{$1}#{$2.upcase}" }
  # Also fix r+letter patterns (e.g., "22r1a" → "22r1A") separately
  cleaned = cleaned.gsub(/(\d)(r)(\d+)([a-z])$/) do
    "#{$1}#{$2}#{$3}#{$4.upcase}"
  end
  # Fix letter suffix before r (e.g., "53ar1" → "53Ar1")
  # For patterns like NIST SP 800-53ar1 where letter is between number and revision
  cleaned = cleaned.gsub(/(\d)([a-z])(r\d)/) { "#{$1}#{$2.upcase}#{$3}" }
  # NOTE: no rule for an uppercase letter before r — it was breaking 800-56Ar2 parsing.
  # The parser should handle 56Ar2 as a single unit (letter suffix + revision)

  # Fix letter suffix before volume: "1-2bv1" → "1-2Bv1" (MR format)
  # BUT preserve "rv" (revision year) patterns: "1013rv1953" stays as-is
  # Skip for NCSTAR to preserve lowercase letters (patterns like "1-1av1" should stay lowercase)
  unless cleaned.include?("NCSTAR")
    cleaned = cleaned.gsub(/(\d)([a-z&&[^r]])(v\d+)/) do
      "#{$1}#{$2.upcase}#{$3}"
    end
  end

  # Fix space before volume number: "80-2073 2" → "80-2073 v2"
  # This handles NBS IR 80-2073 2 and NBS IR 80-2073 3 as volume identifiers
  cleaned = cleaned.gsub(/(\d{2}-\d{4})\s+(\d)$/, '\1 v\2')

  # Fix draft with number: "8270-draft2" → "8270 -draft 2"
  # Space BEFORE dash AND after draft to separate it from report_number
  cleaned = cleaned.gsub(/(\d)-draft(\d)/, '\1 -draft \2')

  # Draft without dash: "8270draft2" → "8270 -draft 2"
  cleaned = cleaned.gsub(/(\d)draft(\d)/, '\1 -draft \2')

  # Fix supplement typo: "154suprev" → "154supprev"
  cleaned = cleaned.gsub(/(\d)suprev/, '\1supprev')

  # Fix letter suffix + revision before draft: "140Cr1-draft2" → "140C r1-draft2"
  # NOTE(review): the original comment says this must run BEFORE the general
  # draft preprocessing, yet the draft rules above have already run — confirm
  # the intended order.
  cleaned = cleaned.gsub(/(\d{2,})([A-Z])(r\d+)([-\s]draft\d*)/,
                         '\1\2 \3\4')

  # Convert Roman numeral volumes to Arabic per NIST spec (page 7)
  # "1011-I-2.0" → "1011 v1 ver2.0"
  # "1011-II-1.0" → "1011 v2 ver1.0"
  cleaned = cleaned.gsub(/(\d+)-([IVX]+)-(\d+(?:\.\d+)*)/) do
    number = $1
    roman = $2
    version_part = $3
    "#{number} v#{roman_to_arabic(roman)} ver#{version_part}"
  end

  # Fix LCIRC supplement with slash and year: "118supp3/1926" → "118 supp3/1926"
  cleaned = cleaned.gsub(/(\d)(supp\d+\/\d{4})/, '\1 \2')

  # Fix Pt pattern: "800-57Pt3r1" → "800-57 pt3 r1"
  cleaned = cleaned.gsub(/(\d)Pt(\d+)(r\d+)/, '\1 pt\2 \3')

  # Fix version patterns: "ver1e2006" → "ver1 e2006", "ver2v1" → "ver2 v1"
  cleaned = cleaned.gsub(/(\d)ver(\d)/, '\1 ver \2')
  cleaned = cleaned.gsub(/ver(\d+)e(\d{4})/, 'ver\1 e\2')
  cleaned = cleaned.gsub(/ver(\d+)v(\d+)/, 'ver\1 v\2')

  # Fix dotted version: separate from number "268v1.1" → "268 v1.1".
  # Applied twice on purpose: gsub matches cannot overlap, so a second pass is
  # needed when one dotted version directly follows another.
  cleaned = cleaned.gsub(/(\d)(v\d+\.\d+)/, '\1 \2')
  cleaned = cleaned.gsub(/(\d)(v\d+\.\d+)/, '\1 \2')

  # Separate version from number AND convert spaces to dots in one step
  cleaned = cleaned.gsub(/(\d)(v\d+)\s+(\d+)$/, '\1 \2.\3') # Two-part: "268v1 1" → "268 v1.1"
  cleaned = cleaned.gsub(/(\d)(v\d+)\s+(\d+)\s+(\d+)$/, '\1 \2.\3.\4') # Three-part: "63v1 0 1" → "63 v1.0.1"

  # Fix volume ranges: "535v2a-l" → "535 v2a-l", "535v2m-z" → "535 v2m-z"
  cleaned = cleaned.gsub(/(\d)(v\d+[a-z]-[a-z])/, '\1 \2')

  # Fix volume with uppercase letter: "48v3B" → "48 v3B"
  cleaned = cleaned.gsub(/(\d)(v\d+[A-Z])/, '\1 \2')

  # Normalize uppercase volume ranges to lowercase: "v2A-L" → "v2a-l".
  # A block is required here: calling #downcase on the replacement TEMPLATE
  # ('\1\2-\3') only lowercases the template text, never the captured letters.
  # The trailing lookahead restricts the rule to single-letter range ends.
  cleaned = cleaned.gsub(/(v\d+)([A-Z])-([A-Z])(?![A-Za-z])/) do
    "#{$1}#{$2.downcase}-#{$3.downcase}"
  end

  # Fix edition with "ed." suffix: "2006ed." → "e2006" (V1 compatibility)
  # Pattern appears at end of identifier: "NIST SP 260-162 2006ed."
  cleaned = cleaned.gsub(/(\d{4})ed\./, 'e\1')

  # Fix revision attached to number BEFORE update patterns!
  # "8115r1-upd" → "8115 r1-upd" so that later "r1-upd" → "r1 -upd" works
  # But preserve r6/1925 format (don't add space before slash/year)
  # Also handle r1a (revision with letter suffix): "800-22r1a" → "800-22r1A",
  # kept together for the second_number pattern.
  # \d{1,2} (not \d+) limits the revision to 1-2 digits, allowing [a-z] to match.
  # First rule: match r+digit+letter (keep together, uppercase the letter)
  cleaned = cleaned.gsub(/(\d+)(r\d{1,2})([a-z])(?=-|[A-Z]|$)/) do
    "#{$1}#{$2}#{$3.upcase}"
  end
  # Second rule: match r+digit WITHOUT letter suffix
  # PRESERVE compact format (no space) when at end of string (NIST SP 800-53r4)
  # ADD space only when followed by dash+uppercase or /upd, /errata, /insert
  cleaned = cleaned.gsub(/(\d+)(r\d{1,2})(?![a-zA-Z])(?=[A-Z]|-(?=[A-Z])|\/(?:upd|errata|insert))/) do
    "#{$1} #{$2}"
  end

  # Fix spaces in version/volume numbers: "v1 1" → "v1.1", "1011-I-2 0" → "1011-I-2.0"
  # Handles multiple spaces: "v1 0 1" → "v1.0.1", "v1 0 2" → "v1.0.2"
  # The pattern starts at a word boundary with "v" or a digit so that
  # "rev 2013" is not read as "v" + " 2013".
  cleaned = cleaned.gsub(/(\b(?:v|\d)[v\d]*[-A-Z]*)\s+(\d+)(?!(?i:pd|wd|prd)\b)\s+(\d+)(?!(?i:pd|wd|prd)\b)/, '\1.\2.\3') # Three parts
  # Negative lookahead: don't swallow the digit of a numeric draft
  # stage ("189 2pd" must stay split, not become "189.2pd"); letter
  # stages ("ipd") already don't match the trailing \d+.
  cleaned = cleaned.gsub(/(\b(?:v|\d)[v\d]*)\s+(\d+)(?!(?i:pd|wd|prd)\b)/, '\1.\2') # Two parts

  # Fix update patterns: ensure space before -upd or /upd (not just at end)
  # Handles optional digits after upd: -upd, -upd1, /upd, /upd1
  cleaned = cleaned.gsub(/(\d+)-upd(\d*)/, '\1 -upd\2') # -upd or -upd1
  cleaned = cleaned.gsub(/(\d+)\/upd(\d*)/, '\1 /upd\2') # /upd or /upd1
  cleaned = cleaned.gsub(/([a-z]\d+)-upd/, '\1 -upd') # r1-upd → r1 -upd
  cleaned = cleaned.gsub(/([a-z]\d+)\/upd/, '\1 /upd') # After revision: r1/upd → r1 /upd

  # MR format with letter suffix before update: "8286C-upd1" → "8286C -upd1"
  cleaned = cleaned.gsub(/(\d+[A-Z])-upd(\d*)/, '\1 -upd\2') # Letter suffix + update
  cleaned = cleaned.gsub(/(\d+[A-Z])\/upd(\d*)/, '\1 /upd\2') # Letter suffix + /upd variant

  # Fix supplement patterns: ensure space before supplement.
  # ("118supp3/1926" is already separated by the LCIRC supplement rule above.)
  # The variants are kept in their original order; several overlap, but gsub
  # matches cannot overlap within one pass, so later passes may still apply.
  cleaned = cleaned.gsub(/(\d)(sup\d)/, '\1 \2') # 100-2sup1 → 100-2 sup1
  # NOTE(review): "sup+" is "su" followed by one or more "p", so this also
  # matches "supp3" — it is not the literal "sup+" handled by the next rule.
  cleaned = cleaned.gsub(/(\d)(sup+)(\d)/, '\1 \2\3')
  cleaned = cleaned.gsub(/(\d)(sup\+)(\d)/, '\1 \2\3') # 100-2sup+1 → 100-2 sup+1
  cleaned = cleaned.gsub(/(\d)(sup\d+)/, '\1 \2') # 100-2sup1 → 100-2 sup1
  cleaned = cleaned.gsub(/(\d)(sup\d+\b)/, '\1 \2') # 100-2sup1 → 100-2 sup1

  # Fix letter suffix + supplement: "378Gsup" → "378Gsupp" (LCIRC patterns)
  # Normalize "sup" to "supp" for letter suffix patterns to match circ_supplement_identifier rule
  cleaned = cleaned.gsub(/(\d+[A-Z])sup(\b)/, '\1supp\2') # 378Gsup → 378Gsupp

  # Fix LCIRC supplement without letter suffix: "118sup12/1926" → "118supp12/1926"
  cleaned = cleaned.gsub(/(\d+)sup(\d+\/\d{4})/, '\1supp\2') # 118sup12/1926 → 118supp12/1926

  # Unify dashed/undashed year supplements: "supp-YYYY" → "suppYYYY".
  # A bare dash before a 4-digit year is not semantic — "25supp-1924" and
  # "25supp1924" denote the same publication (the genuine edition marker is
  # explicit "e", e.g. "25suppe1924"). Collapsing the dash here gives both
  # spellings ONE parse tree (the normal first_number path), so they build
  # to an identical Circular with supplement=<year>, with equal ==/URN.
  # Guard: 4 digits NOT followed by another digit or a slash, so the
  # dash-slash form "supp-12/1926" (supplement_dash_slash_year) is untouched.
  cleaned = cleaned.gsub(/(\d)(supp?)-(\d{4})(?![\d\/])/, '\1\2\3') # 25supp-1924 → 25supp1924

  # Revision-with-letter forms are fully handled by the two revision rules above:
  # - "800-22r1a" → "800-22r1A" (kept together, uppercase letter)
  # - "800-22r1" → "800-22 r1" (space added when no letter suffix)

  # Fix number with letter suffix followed by standalone 'r': "56ar" → "56a r"
  cleaned = cleaned.gsub(/(\d[a-z])r\b/, '\1 r')

  # Fix revision followed by language code: "r1es" → "r1 es", "r1pt" → "r1 pt"
  cleaned = cleaned.gsub(/(r\d+)(es|pt|chi|viet|port|esp)\b/, '\1 \2')

  # Fix MR format translation codes: ".spa" → " spa", ".por" → " por", ".ind" → " ind"
  # Prevents 3-letter translation codes from being parsed as letter suffixes
  # "NIST.SP.1262.spa" → "NIST.SP.1262 spa" (convert dot to space)
  cleaned = cleaned.gsub(/^([A-Z]+)\.SP\.(\d+)\.([a-z]{2,4})$/,
                         '\1.SP.\2 \3')
  cleaned = cleaned.gsub(/^([A-Z]+)\.([A-Z]+)\.(\d+)\.([a-z]{2,4})$/,
                         '\1.\2.\3 \4')

  # Edition year normalization (-YYYY → eYYYY)
  # Per NIST spec, trailing -YYYY should normalize to eYYYY format
  # Pattern: number (optionally with non-E letter suffix) followed by dash and 4-digit year
  # Examples: "330-2019" → "330e2019"
  # Must NOT match existing edition patterns like "11e2-1915" (e2 is edition, -1915 is separate)
  # Must be at end or before space to avoid breaking number-number patterns like "800-53"
  # Negative lookbehind (?<![eE-]) prevents matching after e/E or dash (avoids e2-1915 and 105-1-1990)
  # Only years in the NBS/NIST range (1901-2099) are converted; numbers outside
  # this range are part numbers, not edition years (e.g., SP 250-1039).
  cleaned = cleaned.gsub(/(?<!e\d)(?<![eE-])(\d(?:[A-DF-Z]?))-(\d{4})(?=\s|$)/) do |match|
    prefix = $1 # Number with optional letter
    year = $2.to_i
    year.between?(1901, 2099) ? "#{prefix}e#{year}" : match
  end
  # Revert the conversion for HB series (handbooks) to preserve -YYYY format:
  # "HB 130e1979" → "HB 130-1979", "HB 105-1e1990" → "HB 105-1-1990".
  # [^:\s.]*? excludes dots so MR format ("NIST.HB.135e2022") is not reverted.
  # The text between the series and the final number is captured and re-emitted;
  # leaving it uncaptured silently dropped it ("HB 105-1e1990" → "HB 1-1990").
  cleaned = cleaned.gsub(/\b(HB|HB\s+)([^:\s.]*?)(\d+)e(\d{4})(?=\s|$)/,
                         '\1\2\3-\4')
  # Revert the conversion for OWMWP series to preserve date format MM-DD-YYYY
  # OWMWP uses date as the number: "06-13-2018" (not an edition)
  # Pattern: "OWMWP 06-13e2018" → "OWMWP 06-13-2018"
  # The separator between series and date (e.g. the MR dot) is captured and kept.
  cleaned = cleaned.gsub(
    /\b(OWMWP|OWMWP\s*)([^:\s]*?)(\d{2})-(\d{2})e(\d{4})(?=\s|$)/, '\1\2\3-\4-\5'
  )
  # Revert the conversion for RPT series to preserve year range format YYYY-YYYY
  # Report series uses year ranges as the number: "1946-1947" (not an edition)
  # Pattern: "RPT 1946e1947" → "RPT 1946-1947"
  # Only forward ranges (first year < second year) are reverted.
  cleaned = cleaned.gsub(/\b(RPT|RPT\s*)([^:\s]*?)(\d{4})e(\d{4})(?=\s|$)/) do |match|
    prefix = $1 # "RPT" or "RPT "
    separator = $2 # "." or "" or other non-colon, non-space chars
    first_year = $3.to_i
    second_year = $4.to_i
    first_year < second_year ? "#{prefix}#{separator}#{first_year}-#{second_year}" : match
  end

  # Version normalization (v1.1 → ver1.1, Ver. 2.0 → ver2.0)
  # MR format version normalization must come BEFORE general v normalization
  # Pattern: "NIST.SP.500-281-v1.0" → "NIST.SP.500-281.ver1.0"
  # This allows report_number to match "500-281" and version rule to match ".ver1.0"
  cleaned = cleaned.gsub(/-v(\d+\.\d+)/, '.ver\1')

  # Handle Ver. with period: "Ver. 2.0" → "ver2.0" (remove period and space)
  cleaned = cleaned.gsub(/\bVer\.\s+(\d+(?:\.\d+)*)/, 'ver\1')
  # Handle short "v" to "ver": "v1.1" → "ver1.1" (only with dots - versions have dots)
  cleaned = cleaned.gsub(/\bv(\d+\.\d+(?:\.\d+)*)/, 'ver\1')

  # Fix uppercase P for part: "428P1" → "428 p1", "647P2" → "647 p2"
  cleaned = cleaned.gsub(/(\d)P(\d)/, '\1 p\2')

  # Normalize part notation: "p1" → "pt1", "n1" → "pt1" for consistency
  # MUST come AFTER uppercase P normalization
  # EXCLUDE pattern: {number}p{digit}{4-digit-year} like "28p11969" (part + year, not part notation)
  cleaned = cleaned.gsub(/\b([pn])(\d+)(?!\d{4}\b)/, 'pt\2')

  # Fix complex part patterns in MR format: ensure space before part
  cleaned = cleaned.gsub(/(\d)([pP]\d+)/, '\1 \2') # .467p1adde1 → .467 p1adde1, 800-57p1 → 800-57 p1

  # Fix CRPL-F series: ensure space after series (e.g., "CRPL-F-B150" → "CRPL-F-B 150")
  cleaned = cleaned.gsub(/(NBS CRPL-F-[AB])(\d)/, '\1 \2')
  cleaned = cleaned.gsub(/(CRPL-F-[AB])(\d)/, '\1 \2')

  # Extract volume from number: "17-917v3" → "17-917 v3", "1-1v1" → "1-1 v1"
  # Pattern: digits-digits followed by v and digits (GCR, NCSTAR patterns)
  # MUST be specific to avoid breaking existing "v1.1" patterns
  cleaned = cleaned.gsub(/(\d+-\d+)(v\d+)(?![.\d])/, '\1 \2') # Negative lookahead for dots

  # pd_suffix rule handles " 2pd" directly (space >> digits >> str("pd"))
  # No preprocessing needed - adding space before "pd" breaks the parser

  # Fix "Suppl" with space: "955 Suppl" → "955Suppl"
  cleaned = cleaned.gsub(/(\d+)\s+Suppl\b/, '\1Suppl')

  # Fix verbose "Version" format: " Version 2" → " ver 2"
  cleaned = cleaned.gsub(/\s+Version\s+(\d+)/, ' ver \1')

  # Fix verbose "Revision" format: " Revision (r)" → " r"
  cleaned = cleaned.gsub(/\s+Revision\s+\(r\)/, " r")

  # Fix verbose "Part N" → short "ptN": "800-57 Part 2 Rev. 1" →
  # "800-57pt2 Rev. 1". The grammar already accepts short "ptN" (and
  # "ptN Rev. M"); only the verbose spelling was unsupported. Attaches
  # to the preceding number so the existing part rule applies.
  cleaned = cleaned.gsub(/\s+Part\s+(\d+)/, 'pt\1')

  # Normalize verbose addendum " Add"/" add" (with or without period)
  # to the canonical " Add." the grammar accepts, and uppercase a
  # doc-number letter that immediately precedes it ("800-38a Add" →
  # "800-38A Add.") — NIST doc-number letters are canonically uppercase
  # and the letter_number grammar rule only splits the uppercase form.
  # Scoped to the addendum context so bare markers like "800-90r"
  # (revision) are left untouched.
  cleaned = cleaned.gsub(/(\d[a-z]?)\s+Add\b\.?/i) { "#{Regexp.last_match(1).upcase} Add." }

  # Fix verbose "rev YYYY" format: "126 rev 2013" → "126r2013"
  # Handles patterns like "NIST SP 260-126 rev 2013" → "NIST SP 260-126r2013"
  cleaned = cleaned.gsub(/(\d+)\s+rev\s+(\d{4})/, '\1r\2')

  # Fix historical "report ;" format: "NBS report ; 8079" → "NBS RPT 8079"
  # The semicolon and "report" (spelled out) are historical formats
  cleaned = cleaned.gsub(/\breport\s*;\s*/, "RPT ")
  cleaned = cleaned.gsub(/\breport\b/, "RPT")

  # Dots and spaces between numbers are deliberately NOT rewritten to
  # underscores: dots are PART separators in NIST identifiers.

  # Detect the format from the same stripped text the parser was given, so
  # surrounding whitespace cannot turn an MR identifier into :short.
  parsed_format = detect_format(raw)

  result = new.parse(cleaned)

  case result
  when Hash
    result.merge(parsed_format: parsed_format)
  when Array
    # The identifier rule may return several components (e.g. compound_series +
    # edition); fold every Hash element into one, ignoring non-Hash elements.
    result.grep(Hash)
          .reduce({}) { |merged, component| merged.merge(component) }
          .merge(parsed_format: parsed_format)
  else
    result
  end
end
431
+
432
+ # Detect format from input string
433
+ # :mr if contains dots (machine-readable: NIST.SP.800-53)
434
+ # :short otherwise (default: NIST SP 800-53)
435
# Classifies an identifier string as machine-readable or short form.
#
# Surrounding whitespace is ignored, matching the parser entry point (which
# strips its input before parsing); without this, a padded MR identifier such
# as " NIST.SP.800-53\n" was reported as :short.
#
# @param input [#to_s] raw identifier text (nil is treated as empty)
# @return [Symbol] :mr when the text contains a dot and no internal
#   whitespace (e.g. "NIST.SP.800-53", "FIPS.46e1977"), otherwise :short
def self.detect_format(input)
  text = input.to_s.strip
  dotted = text.include?(".")
  spaced = text.match?(/\s/)
  dotted && !spaced ? :mr : :short
end
448
+
449
+ # Convert Roman numerals to Arabic numbers
450
+ # I→1, II→2, III→3, IV→4, V→5, VI→6, VII→7, VIII→8, IX→9, X→10
451
# Converts a canonical Roman numeral to its Arabic value, as a String.
#
# Handles any canonical numeral (I..MMMCMXCIX), not just I..X, because the
# preprocessing regex that feeds this method accepts arbitrary runs of
# I/V/X (e.g. "XII", "XIV", "XX").
#
# @param roman [String] uppercase Roman numeral, e.g. "IV"
# @return [String] the decimal value ("4"); input that is not a canonical
#   uppercase numeral (including "", "IIII", lowercase, non-strings) is
#   returned unchanged
def self.roman_to_arabic(roman)
  canonical = /\A(?=.)M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})\z/
  return roman unless roman.is_a?(String) && canonical.match?(roman)

  symbol_values = { "I" => 1, "V" => 5, "X" => 10, "L" => 50, "C" => 100, "D" => 500, "M" => 1000 }
  values = roman.each_char.map { |symbol| symbol_values.fetch(symbol) }

  # Subtractive notation: a symbol placed before a larger one is subtracted.
  total = values.each_with_index.sum do |value, index|
    following = values[index + 1]
    following && following > value ? -value : value
  end
  total.to_s
end
466
+
467
+ # Basic building blocks
468
# Single-character literal tokens (including the "#" hash prefix used by
# machine-readable formats).
{ space: " ", dot: ".", dash: "-", slash: "/", hash_prefix: "#" }.each do |rule_name, literal|
  rule(rule_name) { str(literal) }
end

# Single-character classes.
{
  digit: "[0-9]",
  letter: "[A-Za-z]",
  upper_letter: "[A-Z]",
  lower_letter: "[a-z]"
}.each do |rule_name, character_class|
  rule(rule_name) { match(character_class) }
end

# One or more digits.
rule(:digits) { digit.repeat(1) }
480
+
481
+ # Month abbreviations
482
# Month names. Full names are tried before their three-letter abbreviations
# so that "March" is not consumed as "Mar" + "ch". "May" appears only once
# because its full and abbreviated spellings coincide.
rule(:month_abbrev) do
  month_names = %w[
    January February March April May June July August
    September October November December
    Jan Feb Mar Apr Jun Jul Aug Sep Oct Nov Dec
  ]
  month_names.map { |month_name| str(month_name) }.reduce(:|)
end
489
+
490
+ # Language codes for translations - 2-4 letter codes
491
+ # Supports: " spa", "(spa)", ".spa" (MR format)
492
# Translation language code with an optional leading space or dot,
# captured (separator included) as :translation.
#
# The alternation is an ordered choice: the first alternative that matches
# wins and later ones are never retried. Known codes are therefore listed
# longest-first — with "es" ahead of "esp", "esp" could never match in full.
# Any other run of 2-4 lowercase letters is accepted as a fallback.
rule(:language_code) do
  known_codes = %w[viet port chi esp es pt].map { |code| str(code) }.reduce(:|)
  generic_code = match("[a-z]").repeat(2, 4)
  ((space | dot).maybe >> (known_codes | generic_code)).as(:translation)
end
496
+
497
+ # Stage ID: i (initial), f (final), 1-9 (numbered iterations)
498
# Stage identifier: "i"/"I", "f"/"F", or a single digit 1-9.
rule(:stage_id) do
  %w[i I f F 1 2 3 4 5 6 7 8 9].map { |token| str(token) }.reduce(:|)
end
503
+
504
+ # Stage type: pd (public draft), wd (work-in-progress), prd (preliminary)
505
# Stage type keyword in all-lowercase or all-uppercase: pd, wd or prd.
rule(:stage_type) do
  %w[pd PD wd WD prd PRD].map { |token| str(token) }.reduce(:|)
end
508
+
509
+ # Old style stage: (IPD), (FPD), (2PD) - parenthetical at document start
510
# Parenthesised stage such as "(IPD)" or "(2PD)"; yields
# { stage: { stage_id:, stage_type: } }.
rule(:old_stage) do
  stage = (stage_id.as(:stage_id) >> stage_type.as(:stage_type)).as(:stage)
  str("(") >> stage >> str(")")
end
513
+
514
+ # New style stage: " ipd", ".ipd" - inline at document end
515
# Inline stage introduced by a space or a dot, such as " ipd" or ".ipd";
# yields { stage: { stage_id:, stage_type: } }.
rule(:new_stage) do
  separator = space | dot
  stage = (stage_id.as(:stage_id) >> stage_type.as(:stage_type)).as(:stage)
  separator >> stage
end
518
+
519
+ # Publisher
520
# Publisher token ("NBS" or "NIST"), captured as :publisher.
rule(:publisher) do
  %w[NBS NIST].map { |publisher_name| str(publisher_name) }.reduce(:|).as(:publisher)
end
523
+
524
+ # Compound series (include publisher in series name) - must be checked FIRST
525
# Series names that embed the publisher (or another multi-word prefix),
# captured as :series.
#
# The list order is significant: alternatives are tried top to bottom and
# the first match wins, so any name that is a prefix of another must come
# AFTER the longer one (e.g. "NBS CSM" before "NBS CS").
rule(:compound_series) do
  series_names = [
    # Longest patterns first to avoid partial matches
    "NBS BRPD-CRPL-D", "NBS CRPL-F-A", "NBS CRPL-F-B",
    "NBS CS-E", "CSRC Building Block", "CSRC Use Case", "CSRC Book",
    "ITL Bulletin", "NSRDS-NBS",
    # NBS and NIST specific patterns that conflict with simple series
    "NIST LCIRC", "NBS LCIRC", "NIST.LCIRC", "NBS.LCIRC", "NBS RPT",
    "NIST PS", "NIST DCI", "NIST Other",
    "NISTPUB",
    "NBS CSM", "NBS CIRC", "NBS.CRPL", "NBS CRPL", "NBS CS",
    "NBS CIS", "NBS HR", "NBS IRPL", "NBS IP", "NBS PS",
    "NBS BH"
  ]
  series_names.map { |series_name| str(series_name) }.reduce(:|).as(:series)
end
541
+
542
+ # Simple series (no publisher prefix)
543
# Series names without a publisher prefix, captured as :series.
#
# Alternatives are an ordered choice (first match wins, no retry), so a name
# that is a prefix of another must come AFTER the longer one. "CSM" is
# therefore listed before "CS" — in the reverse order "CS" always won and
# "CSM" was unreachable. "CSWP" likewise precedes "CS".
rule(:simple_series) do
  series_names = %w[
    AMS VTS
    BSS BMS BH
    FIPS GCR HB MONO
    MP NCSTAR NSRDS IR
    SP TN CSWP
    AI CIRC CSM CS
    CRPL LCIRC OWMWP PC RPT
    SIBS TIBM TTB EAB
    JPCRD JRES
  ]
  series_names.map { |series_name| str(series_name) }.reduce(:|).as(:series)
end
556
+
557
+ # Suffix letter(s) after number - supports single letters and specific two-letter suffixes
558
+ # Two-letter suffixes: Ur (Unclassified Revised), Ua (Unclassified Amended), Ub-Uj (series variants)
559
+ # Single letter: any letter not followed by excluded keywords
560
# Letter suffix that may follow a number.
#
# Two shapes are accepted, tried in this order:
# 1. "U" followed by one lowercase letter (e.g. "Ur", "Ua").
# 2. Any single letter, optionally followed by digits — but only when the
#    text right AFTER that letter does not start with one of the blocked
#    fragments below. The fragments are the tails of keywords minus their
#    first letter (e.g. "raft" blocks "d" + "raft", "ndex" blocks
#    "i" + "ndex"), so the first letter of such a keyword is not taken as
#    a suffix.
rule(:number_suffix) do
  blocked_followers = %w[
    ec ndex nsert rrata raft pp s t hi iet ort r p
  ].map { |fragment| str(fragment) }.reduce(:|)

  unclassified_suffix = str("U") >> lower_letter
  single_letter_suffix = match("[a-zA-Z]") >> blocked_followers.absent? >> digits.maybe

  unclassified_suffix | single_letter_suffix
end
579
+
580
# Digits with an optional letter suffix. The suffix is taken only when no
# digit follows it, so the "e" in "140e2" is not consumed as a suffix.
rule(:digits_with_suffix) do
  optional_suffix = (number_suffix >> digit.absent?).maybe
  digits >> optional_suffix
end
585
+
586
+ # Report number - first part - support edition prefixes like "e104" and supplement suffixes like "144supp"
587
+ # Supplements should be handled as separate parts
588
rule(:first_number) do
  # First numeric component of a report number, captured as :first_number.
  #
  # The alternatives form a PEG ordered choice: the first alternative that
  # matches wins, and a later failure in the enclosing sequence never
  # re-enters the choice. More specific shapes must therefore precede any
  # alternative that matches one of their prefixes.
  (
    # OWMWP date format: MM-DD-YYYY (e.g., 06-13-2018)
    # Must be FIRST to match before other dash patterns
    (match("[0-9]").repeat(2, 2).as(:owmwp_month) >> dash >>
      match("[0-9]").repeat(2, 2).as(:owmwp_day) >> dash >>
      match("[0-9]").repeat(4, 4).as(:owmwp_year)).as(:owmwp_date_number) |
    # Special text patterns for RPT: "ADHOC", "div<N>"
    str("ADHOC") | (str("div") >> digits) |
    # Month ranges for RPT: Apr-Jun1948
    (month_abbrev >> dash >> month_abbrev >> digits) |
    # Number with volume suffix (e.g., "539v10" for CIRC, "1011v1" for general patterns)
    # Kept ahead of the GB/CS series alternatives below.
    (digits.as(:number) >> str("v") >> digits.as(:volume_suffix)).as(:number_with_volume) |
    # Roman numeral patterns: 1011-I-2.0, 1011-II-1.0 (trailing ".N" is optional)
    # Longer numerals are listed before their prefixes (VIII, VII, VI before V;
    # IV, IX, III, II before I). With the shorter numeral first, "V" or "I"
    # would match and the following dash would then fail, making VI, VII,
    # VIII and IX unreachable.
    (digits >> dash >>
      (str("VIII") | str("VII") | str("VI") | str("IV") | str("IX") |
        str("III") | str("II") | str("I") | str("V") | str("X")) >>
      dash >> digits >> (dot >> digits).maybe) |
    # GB series pattern: 1190GB-1, 1190GB-4A
    (digits >> str("GB") >> dash >> digits >> upper_letter.maybe) |
    # CS series pattern with letter in middle: 102E-42, 123A-50
    (digits >> upper_letter >> dash >> digits) |
    # Volume-number format for CSM series: v6n1, v7n12
    # Volume and issue number are captured separately.
    (str("v") >> digits.as(:volume_number) >> str("n") >> digits.as(:issue_number)) |
    # Regular number with supplement and revision suffix: "154supprev"
    (digits >> str("supprev")) |
    # Regular number with edition and revision year-only: "13e2rev1908"
    (digits >> str("e") >> digits >> str("rev") >> digits) |
    # Number with revision year (rv pattern for LetterCircular): "1013rv1953"
    (digits.as(:number) >> str("rv") >> digits.as(:revision_year)).as(:number_with_rev_year) |
    # Regular number with edition, revision, and month-date: "13e2revJune1908"
    (digits >> str("e") >> digits >> str("rev") >> month_abbrev >> digits) |
    # Regular number with eN suffix and optional supplement (e.g., "101e2supp")
    (digits >> str("e") >> digits >> str("supp") >> digits.maybe) |
    # Edition prefix with revision and date: e2revJune1908
    (str("e") >> digits >> str("rev") >> month_abbrev >> digits) |
    # Edition prefix followed by digits and optional supplement with digits
    (str("e") >> digits >> str("supp") >> digits.maybe) |
    # Regular number with eN suffix (e.g., "101e2")
    (digits >> str("e") >> digits) |
    # Bare edition (just "e2" without number prefix), not followed by "-<digits>"
    (str("e") >> digits >> (dash >> digits).absent?) |
    # Letter prefix with digits (e.g., "c4" for CRPL)
    (lower_letter >> digits) |
    # Regular number with supplement suffix with month/year (e.g., "24suppJan1924")
    (digits >> str("supp") >> month_abbrev >> digits) |
    # Regular number with supplement suffix (e.g., "144supp") - with optional digits
    (digits >> str("supp") >> digits.maybe) |
    # Regular number with "sup" suffix followed by month/year
    (digits >> str("sup") >> month_abbrev >> digits) |
    # Regular number with "sup" suffix (e.g., "9350sup") for RPT patterns
    (digits >> str("sup")) |
    # Language code suffix without separator (e.g., "1088sp")
    # Must come BEFORE the general suffix pattern and AFTER the sup/supp
    # alternatives. The trailing lookaheads reject a following letter, digit,
    # dash or dot, so this only matches a code that ends the number.
    (digits.as(:number) >> (str("sp") | str("pt") | str("es") | str("SP") | str("PT") | str("ES")).as(:language_code) >> (upper_letter.absent? >> digit.absent? >> letter.absent? >> dash.absent? >> dot.absent?)) |
    # Part+edition suffix for MR format: "28pt1e1969" (part notation + edition year)
    # Handles patterns like "NBS.HB.28pt1e1969" where part and edition are attached.
    # This sits AFTER the language-code alternative but is still reachable:
    # that alternative refuses a digit after the code (digit.absent?), so
    # "28pt1..." falls through to here.
    (digits.as(:number) >> str("pt") >> digits.as(:part_number) >> str("e") >> digits.as(:edition_year)) |
    # Parenthetical language code (e.g., "378(sp)")
    # Only uppercase codes are listed; the original note says preprocessing
    # uppercases parenthesised content.
    (digits.as(:number) >> str("(") >> (str("SP") | str("PT") | str("ES")).as(:language_code) >> str(")")) |
    # Regular number with optional suffix - includes letters like "A"
    digits_with_suffix
  ).as(:first_number)
end
657
+
658
# Second number (the component after a dash), captured as :second_number.
# Accepts "pt" suffixes, letter suffixes, attached revisions and CRPL shapes.
# The alternatives are tried in the order listed (PEG ordered choice).
rule(:second_number) do
  # Guards: a leading month abbreviation (so "-Feb1985" is left for the
  # edition handling) and the keyword "draft" are never a second number.
  not_month = month_abbrev.absent?
  not_draft = str("draft").absent?

  # Trailing bare supplement marker on a compound second number
  # (e.g. "800-53sup") so it isn't split into "53s" + "up". Per the original
  # note, the builder strips the marker and sets supplement="".
  bare_supplement = digits >> (str("supp") | str("sup")) >>
                    (digit.absent? >> letter.absent?)

  # Revision with "U"+lowercase letter suffix (e.g., "22r1Ua"); listed before
  # the general letter-suffix form so "U" alone is not taken from "Ua".
  revision_u_letter = digits >> str("r") >> digits >> str("U") >> lower_letter

  # Revision with a single letter suffix (e.g., "22r1a", "22r1A"); listed
  # before plain r+digits so the whole "22r1A" is one unit.
  revision_with_letter = digits >> str("r") >> digits >> match("[a-zA-Z]")

  # Revision attached to the number (e.g., "126r2013")
  revision_attached = digits.as(:number_only) >> str("r") >> digits.as(:edition_id)

  # CRPL range with underscore (e.g., "2_3-1A")
  crpl_underscore = digits >> str("_") >> digits >> dash >> digits >> upper_letter.maybe

  # Lowercase letter, dash, digits (e.g., "m-5")
  letter_dash_digits = lower_letter >> dash >> digits

  # Number with pt suffix (e.g., "57pt1"), rejected when a dash follows so
  # "pt3-1" style part components are left alone.
  number_with_pt = digits >> str("pt") >> digits >> dash.absent?

  # Number with uppercase letter suffix (e.g., "56A", "123B")
  number_upper = digits >> upper_letter

  # "r" directly followed by a letter after the number (e.g., "27ra")
  revision_letter = (digits.as(:number_only) >> str("r") >>
                     match("[a-zA-Z]").as(:letter)).as(:revision_letter)

  # "r" + letter with no leading digits (e.g., "rA")
  revision_letter_only = (str("r") >> match("[a-zA-Z]")).as(:revision_letter_suffix)

  # "r" + digits (e.g., "r1", "r2") for a trailing revision
  revision_simple = (str("r") >> digits.as(:edition_id)).as(:revision_simple)

  # Regular number with optional suffix, unless it starts a FIPS date
  # (dash, month, digits, slash).
  plain_number = digits_with_suffix >> (dash >> month_abbrev >> digits >> slash).absent?

  # Single lowercase letter (e.g., "a") that is not followed by a digit
  lone_lower = lower_letter >> digit.absent?

  candidates = [
    bare_supplement,
    revision_u_letter,
    revision_with_letter,
    revision_attached,
    crpl_underscore,
    letter_dash_digits,
    number_with_pt,
    number_upper,
    revision_letter,
    revision_letter_only,
    revision_simple,
    # Special patterns like "NCNR", "PERMIS", "BFRL"
    str("NCNR"),
    str("PERMIS"),
    str("BFRL"),
    # One to three capital letters, standalone (e.g., "A", "B", "C")
    upper_letter.repeat(1, 3),
    plain_number,
    lone_lower
  ]

  not_month >> not_draft >> candidates.reduce(:|).as(:second_number)
end
709
+
710
# Edition component per NIST spec: <edition-type><edition-id>
#   type: "e" (edition), "r" (revision), "rev" (verbose revision), "-" (historical)
#   id:   a number or a year
# Examples from the original notes: e2, e2021, r5, rev2013, rev 2013, -3
# Every alternative is tagged with its own key, so the notation that matched
# remains visible in the parse tree.
rule(:edition) do
  edition_id = digits.as(:edition_id)
  trailing_letter = match("[a-zA-Z]").as(:edition_letter)

  [
    # "e" prefix, optionally preceded by a space: e2, e3, e2021
    (space.maybe >> str("e") >> edition_id).as(:edition_e),
    # Space, then "r" + id + letter (space-separated form with letter)
    (space >> str("r") >> edition_id >> trailing_letter).as(:edition_r_with_space_letter),
    # Space, then "r" + id (space-separated form)
    (space >> str("r") >> edition_id).as(:edition_r_with_space),
    # Compact "r" + id + letter: r5A
    (str("r") >> edition_id >> trailing_letter).as(:edition_r_no_space_letter),
    # Compact "r" + id: r5
    (str("r") >> edition_id).as(:edition_r_no_space),
    # Verbose "rev" prefix with optional surrounding spaces: rev2013, rev 2013
    (space.maybe >> str("rev") >> space.maybe >> edition_id).as(:edition_rev),
    # Historical "-N": a dash and a single digit 1-9 that is NOT followed by
    # another digit, so date-like "-1908" is not consumed here.
    (dash >> match("[1-9]").as(:edition_id) >> digit.absent?).as(:edition_historical),
    # Dash + exactly four digits: -1979, -1990 (e.g., "NBS HB 130-1979")
    (dash >> match("[0-9]").repeat(4, 4).as(:dash_year)).as(:edition_dash_year)
  ].reduce(:|)
end
737
+
738
# Date component per NIST spec: -{YYYY}, -{YYYYMM} or -{YYYYMMDD}
# Separate from Edition; the original notes say both can coexist.
# Examples: -1908, -190806, -19770930, and the legacy "-June1908" form.
# The whole match is captured as :date.
rule(:date) do
  year = match("[0-9]").repeat(4, 4).as(:date_year)
  month = match("[0-9]").repeat(2, 2).as(:date_month)
  day = match("[0-9]").repeat(2, 2).as(:date_day)

  # Longest numeric form first so -YYYYMMDD is not cut short at -YYYY.
  full_date = dash >> year >> month >> day
  year_month = dash >> year >> month
  year_only = dash >> year
  # Legacy month format: -June1908, -Jan1925
  legacy_month = dash >> month_abbrev.as(:date_month) >> digits.as(:date_year)

  (full_date | year_month | year_only | legacy_month).as(:date)
end
756
+
757
# LEGACY EDITION PATTERNS (kept for backward compatibility during migration).
# Per the original notes these are to be replaced gradually by the proper
# Edition/Date components.
rule(:legacy_edition) do
  month_word = match("[A-Za-z]").repeat(3, 9).as(:edition_month)

  # Complex revision patterns: r1a, r2b (also accepts a " R" prefix)
  revision_letter = (str("r") | str(" R")) >>
                    match("[0-9]").repeat(1, 2).as(:edition) >>
                    lower_letter.as(:edition_letter)

  # Revision with year: rev2013, rev2020, rev 2013 (with space)
  rev_year = str("rev") >> space.maybe >> digits.as(:edition_year)

  # Edition with revision and date: e2revJune1908
  edition_rev_date = (str("e") | str(" E")) >>
                     match("[0-9]").repeat(1, 3).as(:edition) >>
                     str("rev") >> month_word >> digits.as(:edition_year)

  # Edition with year and optional two-digit month: e2018, e201801
  edition_year_month = str("e") >>
                       match("[0-9]").repeat(4, 4).as(:edition_year) >>
                       match("[0-9]").repeat(2, 2).as(:edition_month).maybe

  # Revision-based edition: revJune1908, revJan1925
  rev_month_year = str("rev") >> month_word >> digits.as(:edition_year)

  [revision_letter, rev_year, edition_rev_date, edition_year_month, rev_month_year].reduce(:|)
end
778
+
779
# CRPL range as it appears after the first dash, e.g. the "2_3-1" of
# "1-2_3-1" (optionally with one uppercase letter suffix: "2_3-1A").
# The whole match is captured as :crpl_range.
rule(:crpl_range) do
  range = [digits, str("_"), digits, dash, digits, upper_letter.maybe].reduce(:>>)
  range.as(:crpl_range)
end
783
+
784
# Full report number: the first number followed by at most one continuation
# (dash- or dot-separated). Supports dot-separated parts, CRPL ranges,
# multi-dash GCR patterns, attached editions/revisions (e.g. 800-53r5),
# month-year editions (e.g. 107-Mar1985) and the FIPS day/slash date form
# (e.g. 11-1-Sep30/1977).
# The continuations are a PEG ordered choice, so their order is significant.
rule(:report_number) do
  four_digits = match("[0-9]").repeat(4, 4)

  # Month abbreviation as edition (e.g., 107-Mar1985, 11-Jan1925).
  # Tried first so -MonthYear is not taken by a later alternative.
  month_edition = dash >> month_abbrev.as(:edition_month) >> digits.as(:edition_year)

  # FIPS date format: -1-Sep30/1977 (part-month-day/year with slash).
  # Tried before the multi-dash GCR alternative.
  fips_dated = dash >> digits.as(:fips_part) >> dash >> month_abbrev.as(:edition_month) >>
               digits.as(:edition_day) >> slash >> digits.as(:edition_year)

  # Dash with decimal suffix (e.g., 80-2073.3, 123-45.67)
  decimal = (dash >> digits.as(:decimal_base) >> dot >> digits.as(:decimal_suffix)).as(:decimal_number)

  # Dash with letter suffix (e.g., 1-1A, 1-3B, 73-197Ur): either "U" plus a
  # lowercase letter, or one uppercase letter.
  letter_suffix = ((str("U") >> lower_letter.as(:letter_suffix_extra)) | upper_letter).as(:letter_suffix)
  lettered = (dash >> digits.as(:letter_base) >> letter_suffix).as(:letter_number)

  # Dash + four digits (e.g., -1979 for "NBS HB 130-1979"), only when nothing
  # that could continue the number follows. Per the original note the builder
  # decides whether it is a year or a second number.
  bare_year = (dash >> four_digits.as(:dash_year) >>
               (space | dot | part | crpl_range | second_number | dash).absent?).as(:edition_dash_year)

  # Second number followed by a dash-year (e.g., -1-1990 for "105-1-1990").
  # Tried before the GCR alternative, which starts the same way.
  number_then_year = (dash >> second_number >> dash >> four_digits.as(:dash_year) >>
                      (space | dot | part | crpl_range | revision | draft).absent?).as(:second_number_edition_year)

  # FIPS month+year after a part (e.g., -1-Sep1977 for "11-1-Sep1977")
  number_then_month = (dash >> second_number >> dash >> month_abbrev.as(:edition_month) >>
                       digits.as(:edition_year) >>
                       (space | dot | part | crpl_range | edition | revision | draft).absent?).as(:fips_month_year_after_part)

  # GCR multi-dash pattern (e.g., 85-3273-37, 19-200-30B)
  gcr = dash >> second_number >> dash >> (digits >> upper_letter.maybe).as(:part_number)

  # Dot-separated part (e.g., 984.4 = number 984, part 4)
  dotted = dot >> second_number

  # Dash-separated with optional revision (e.g., 800-53r5)
  dashed = dash >> (crpl_range | second_number) >> edition.maybe

  # Dash followed directly by an edition
  dash_edition = dash >> edition

  # TODO: Language code suffix without separator (e.g., "1088sp")
  # Must come AFTER other patterns to avoid consuming them
  # | (str("sp") | str("pt") | str("es") | match("[a-z]").repeat(2, 4)).as(:language_code) >> (space | dot | part | crpl_range | second_number).absent?) |
  # Parenthetical language code (e.g., "378(sp)")
  # | (str("(") >> match("[a-z]").repeat(2, 4).as(:language_code) >> str(")") >> (space | dot | part | crpl_range | second_number).absent?)
  continuations = [
    month_edition,
    fips_dated,
    decimal,
    lettered,
    bare_year,
    number_then_year,
    number_then_month,
    gcr,
    dotted,
    dashed,
    dash_edition
  ]

  first_number >> continuations.reduce(:|).maybe
end
837
+
838
# Volume: a "v" or " Vol. " prefix followed by the volume designation,
# which is captured as :volume (e.g. "v3", "v3B", "v1a-l").
rule(:volume) do
  # The first branch is the original prefix expression, kept unchanged so
  # every input it accepted is still accepted. The second branch makes the
  # verbose " Vol. " form reachable after a single blank: `space.maybe` is
  # greedy and PEG does not backtrack into it, so once it has consumed the
  # blank, str(" Vol. ") can no longer match at that position. This mirrors
  # how the `part` rule keeps its " Part " form outside `space.maybe`.
  # NOTE(review): assumes `space` (defined outside this chunk) consumes the
  # leading blank of " Vol. " - confirm against its definition.
  ((space.maybe >> (str("v") | str(" Vol. "))) | str(" Vol. ")) >>
    (digits >>
      # Support letter ranges (lowercase normalized in preprocessing)
      (str("a-l") | str("m-z") | str("A-L") | str("M-Z")).maybe >>
      # Support up to two uppercase letters (e.g., v3B, v1A)
      upper_letter.repeat(0, 2)).as(:volume)
end
847
+
848
# Part: "pt", "p" or "P" (optionally preceded by a space) or the verbose
# " Part " prefix, followed by the part designation captured as :part.
# Supports shapes like p1adde1 and pt3r1 (part with revision).
rule(:part) do
  prefix = (space.maybe >> (str("pt") | str("p") | str("P"))) | str(" Part ")

  # Optional revision after the part number: pt3r1, p1r1 (space allowed before "r")
  attached_revision = (space.maybe >> str("r") >> digits).maybe
  # Optional addendum with optional edition: add, adde1
  attached_addendum = (str("add") >> (str("e") >> digits).maybe).maybe
  # Optional "-<digits>" tail
  dash_tail = (dash >> digits).maybe

  prefix >> (digits >> attached_revision >> attached_addendum >> dash_tail).as(:part)
end
858
+
859
# Revision component. The alternatives are a PEG ordered choice: the first
# one that matches wins, so longer/more specific forms are listed first.
rule(:revision) do
  # Revision with month and year: rJun1992, r Jun1992, revJun1992.
  # "rev" is tried before "r": with "r" first, "r" always matched the start
  # of "rev", the rest of the sequence then failed on "ev...", and the "rev"
  # branch could never be reached (ordered choice is not re-entered).
  (space.maybe >> (str("rev") | str("r")) >> space.maybe >> month_abbrev.as(:revision_month) >> digits.as(:revision_year)) |
    # Revision with slash and year: r6/1925, r11/1924, rev6/1925 (LCIRC patterns).
    # Same "rev" before "r" ordering, for the same reason.
    (space.maybe >> (str("rev") | str("r")) >> digits.as(:revision) >>
      slash >> digits.as(:revision_year)) |
    # Revision with 4-digit year directly: r1995, r 1995 (allow space before year)
    ((str(" r") | str("r")) >> space.maybe >> match("[0-9]").repeat(4, 4).as(:revision_year)) |
    # Revision with year: rev2013, rev 2013 (allow space before year)
    (str("rev") >> space.maybe >> digits.as(:revision_year)) |
    # Revision with digits AND/OR letters: r1a, r1A, ra, r1
    # Accepts letter-only revisions and both letter cases in the suffix.
    # The matched prefix is captured as :revision_prefix so the original
    # notation (e.g., " Rev. 5") can be preserved.
    ((str(" rev ") | str("rev") | str(" r") | str("r") | str(" Rev. ") | str(" Revision (r)")).as(:revision_prefix) >>
      ((digits >> match("[a-zA-Z]").maybe) | match("[a-zA-Z]").repeat(1)).as(:revision_id)).as(:revision) |
    # Standalone " r" at end of input (e.g., "800-56a r", "800-27 r").
    # MUST BE LAST so it does not take input from the other patterns.
    (str(" r") >> any.absent?).as(:revision_standalone)
end
882
+
883
# Version (V1 SP parser compatible); the number is captured as :version.
# Supports: ver1.0.2, ver2, " Ver. 2.0", " Version 1.0", v1.0.2, -v1.0,
# and the dot-prefixed MR form ".ver1.0".
rule(:version) do
  # N.N or N.N.N - used by the forms that require dots
  dotted_number = digits >> dot >> digits >> (dot >> digits).maybe

  # "ver" form, dots optional; a space or dot may precede it
  # (e.g., "500-281.ver1.0") and a space may follow it.
  ver_form = (space | dot).maybe >> str("ver") >> space.maybe >>
             (digits >> (dot >> digits).repeat).as(:version)

  # Verbose " Ver. " / " Version " forms - dots required
  verbose_form = (str(" Ver. ") | str(" Version ")) >> dotted_number.as(:version)

  # Short "v" form with mandatory dots (v1.0, v1.0.2); optional dash or
  # space before it.
  short_form = (dash | space).maybe >> str("v") >> dotted_number.as(:version)

  ver_form | verbose_form | short_form
end
895
+
896
# Update (V1 compatible).
# Format: /Upd{N}-{YYYY}{MM}, where the number, the date and the month are
# each optional (e.g., "500-300-upd" has no number).
# Examples: /Upd1-2015, /Upd3-202102, -upd, /upd
# The matched prefix is captured as :update_prefix so the original notation
# (-upd vs /Upd) can be preserved; the remainder is captured as :update.
rule(:update) do
  marker = (str("/Upd") | (space.maybe >> (str("/upd") | str("-upd")))).as(:update_prefix)

  year = match("[0-9]").repeat(4, 4).as(:update_year)
  month = match("[0-9]").repeat(2, 2).as(:update_month).maybe
  dated = (dash >> year >> month).maybe

  marker >> (digits.as(:update_number).maybe >> dated).as(:update)
end
916
+
917
# Addendum: "-add", ".add" or " Add.", an optional space or dash, then an
# optional number. The number (possibly empty) is captured as
# :addendum_number inside :addendum.
rule(:addendum) do
  marker = str("-add") | str(".add") | str(" Add.")
  number = (digits | str("")).as(:addendum_number)

  (marker >> (space | dash).maybe >> number).as(:addendum)
end
922
+
923
# Supplement: "supp" or "sup" (optionally preceded by a space) with optional
# detail - a revision marker, a date range, a month+year, number/year,
# a bare year, or any alphanumeric suffix.
# Examples: suppJan1924, supp3/1926, supp1925, supJun1925-Jun1927, supprev
rule(:supplement) do
  details = [
    # Supplement followed by revision: supprev
    str("rev").as(:supplement_with_rev),
    # Date range: Jan1924-Jan1926 (tried before the single month+year form)
    (month_abbrev.as(:supp_month_start) >> digits.as(:supp_year_start) >>
      dash >> month_abbrev.as(:supp_month_end) >> digits.as(:supp_year_end)).as(:supplement_date_range),
    # Month and year: Jan1924
    (month_abbrev.as(:supp_month) >> digits.as(:supp_year)).as(:supplement_date),
    # Number with slash and year: 3/1926
    (digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_slash_year),
    # Just year: 1925
    digits.as(:supp_year),
    # General alphanumeric suffix (anything else)
    match("[A-Za-z0-9]").repeat(1).as(:supplement_suffix)
  ]

  space.maybe >> (str("supp") | str("sup")) >> details.reduce(:|).maybe
end
944
+
945
# Errata: "errata" or "err", optionally preceded by a dash; captured as :errata.
# "errata" is tried first so "err" does not match its prefix.
rule(:errata) do
  keyword = str("errata") | str("err")
  (dash.maybe >> keyword).as(:errata)
end
949
+
950
# Index marker: "index" or "indx", captured as :index.
rule(:index) do
  %w[index indx].map { |word| str(word) }.reduce(:|).as(:index)
end
954
+
955
# Insert marker: "insert" or "ins", captured as :insert.
# The longer word is tried first.
rule(:insert) do
  %w[insert ins].map { |word| str(word) }.reduce(:|).as(:insert)
end
959
+
960
# Appendix marker: the literal "app", captured as :appendix.
rule(:appendix) do
  marker = str("app")
  marker.as(:appendix)
end
964
+
965
# Section: the literal "sec" followed by an optional number. The number,
# when present, is captured as :section; a bare "sec" is accepted too.
rule(:section) do
  number = digits.as(:section).maybe
  str("sec") >> number
end
969
+
970
# Translation (3-character language code, V1 compatible), captured as
# :translation. Supported notations: "(spa)", " spa" and ".spa" (MR format).
rule(:translation) do
  code = match('\w').repeat(3, 3).as(:translation)

  # Parenthetical format: (spa), (por), (ind)
  parenthesised = str("(") >> code >> str(")")
  # Space-prefix format: " spa"
  spaced = space >> code
  # Dot-prefix format: ".spa" (machine-readable)
  dotted = dot >> code

  parenthesised | spaced | dotted
end
980
+
981
# Public draft suffix: a space, a number and "pd" (e.g. " 2pd", " 3pd"),
# captured as :public_draft.
rule(:pd_suffix) do
  suffix = space >> digits >> str("pd")
  suffix.as(:public_draft)
end
985
+
986
# Draft stage, captured as :draft. Accepts " (Draft)", a "-draft" suffix
# (optionally preceded by a space and optionally followed by a number), or a
# public draft suffix.
rule(:draft) do
  # " (Draft)"
  parenthetical = space >> str("(Draft)")
  # " -draft 2" or "-draft2": the number may be separated by a space
  dashed = space.maybe >> dash >> str("draft") >> ((space >> digits) | digits).maybe

  (parenthetical | dashed | pd_suffix).as(:draft)
end
993
+
994
# Special FIPS date format with a slash (treated as part of the number, not
# as an edition): -<part>-<Month><day>/<year>.
# Captures :fips_part, :fips_month, :fips_day and :fips_year.
rule(:fips_date) do
  [
    dash, digits.as(:fips_part),
    dash, month_abbrev.as(:fips_month), digits.as(:fips_day),
    slash, digits.as(:fips_year)
  ].reduce(:>>)
end
999
+
1000
# All trailing components an identifier may carry. This is a PEG ordered
# choice, so the order of the list is significant.
rule(:parts) do
  [
    # new_stage comes before language_code so "ipd" is not read as a translation
    new_stage,
    section, index, insert, appendix, pd_suffix,
    edition, date, legacy_edition, revision,
    # version before volume: try dotted versions (v1.1) before simple volumes (v1)
    version,
    volume, part, update, addendum,
    supplement, errata, language_code
  ].reduce(:|)
end
1011
+
1012
# NOTE: the examples below describe the CIRC supplement split
# (base + supplement) handled by circ_supplement_identifier, not this rule:
#   - "NBS CIRC 101e2supp" → base="NBS CIRC 101e2", supplement
#   - "NBS CIRC 25supp-1924" → base="NBS CIRC 25", supplement_year="1924"
#   - "NBS CIRC 24suppJan1924" → base="NBS CIRC 24", supplement_edition="Jan1924"
#   - "NBS CIRC suppJun1925-Jun1926" → date range supplement (no base)
#   - "NBS LCIRC 378Gsup" → base="NBS LCIRC 378G", supplement (no metadata)
#   - "NBS.LCIRC.378sup1/1927" → dot-separated MR format (after preprocessing)
#
# Dot-separated machine-readable (MR) identifier: NIST.SP.800-116 or
# #NIST.2024-01-15.123. Also supports parts after the number
# (NIST.SP.1011-I-2.0) and revision+update (NIST.IR.8115r1-upd).
rule(:mr_identifier) do
  [
    hash_prefix.maybe,
    publisher, dot,
    simple_series, dot,
    report_number,
    # Edition with underscore separator (MR format: 1648_2009)
    (str("_") >> digits.as(:edition_year)).maybe,
    # Letter suffix before update (e.g., 8286C-upd1)
    upper_letter.maybe,
    # Revision component (r1, r5, etc.) before update
    edition.maybe,
    update.maybe,
    # Up to three additional dot-separated digit groups or single uppercase
    # letters; placed before the generic parts below.
    (dot >> (digits | upper_letter)).repeat(0, 3),
    # Any remaining components, then an optional draft marker
    parts.repeat,
    draft.maybe
  ].reduce(:>>)
end
1041
+
1042
# Main identifier structure. Alternatives, in the order they are tried:
#   1. CIRC supplement identifier
#   2. machine-readable (dot-separated) identifier
#   3. compound series (publisher is part of the series name)
#   4. publisher + simple series
#   5. simple series only (no publisher)
rule(:identifier) do
  separator = space | dot

  # Everything that may follow the series in forms 3-5
  trailer = [
    report_number.maybe, fips_date.maybe, parts.repeat,
    draft.maybe, translation.maybe, new_stage.maybe
  ]

  # Compound series, separator, optional old-style stage, then the trailer
  with_compound_series = [compound_series, separator, old_stage.maybe, *trailer].reduce(:>>)

  # Publisher and simple series must be separated by a space or dot; the
  # optional old-style stage follows the series.
  with_publisher = [
    publisher, separator, simple_series, old_stage.maybe, separator, *trailer
  ].reduce(:>>)

  # Simple series without publisher
  series_only = [simple_series, old_stage.maybe, separator, *trailer].reduce(:>>)

  circ_supplement_identifier | mr_identifier |
    with_compound_series | with_publisher | series_only
end
1069
+
1070
# CIRC supplement identifier - splits the identifier into the series, an
# optional base portion and the supplement.
# The series prefix (with its trailing space or dot) is captured as
# :circ_series; it is followed either by a date-range supplement without a
# base number, or by a base portion plus a supplement marker.
rule(:circ_supplement_identifier) do
  series_prefix = (
    (str("NBS CIRC") | str("NBS LCIRC") | str("NBS.CIRC") | str("NBS.LCIRC")).as(:series) >>
    (space | dot)
  ).as(:circ_series)

  # Date range supplement (no base number): suppJun1925-Jun1926
  date_range_only = (str("supp") >> month_abbrev.as(:supp_month_start) >> digits.as(:supp_year_start) >>
                     dash >> month_abbrev.as(:supp_month_end) >> digits.as(:supp_year_end)).as(:supplement_date_range)

  # Base portion: everything before "supp"/"sup" or the slash+year
  base_portion = (
    # Number with edition: "101e2"
    (digits.as(:base_number) >> str("e") >> digits.as(:edition_number)) |
    # Number, lowercase letter, number (revision form): "145r11"
    (digits.as(:base_number) >> lower_letter.as(:revision_letter) >> digits.as(:revision_number)) |
    # Number with letter suffix: "378G"
    (digits.as(:base_number) >> upper_letter.as(:letter_suffix)) |
    # Just number: "25", "24"
    digits.as(:simple_number)
  ).as(:base_portion)

  # Optional metadata after an explicit supplement marker
  supplement_metadata = (
    (month_abbrev >> digits).as(:supplement_month_year) |
    # Dash + number + slash + year (e.g., supp-12/1926)
    (dash >> digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_dash_slash_year) |
    (dash >> digits.as(:supplement_year)) |
    (digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_slash_year) |
    str("").as(:supplement_empty)
  ).maybe

  # Explicit marker: "supp" (tried first) or "sup", then the metadata
  explicit_supplement = (str("supp") | str("sup")) >> supplement_metadata

  # Implicit supplement via slash+year (e.g., "145r11/1925")
  implicit_supplement = (slash >> digits.as(:implicit_supplement_year)).as(:implicit_supplement)

  with_base = base_portion >> (explicit_supplement | implicit_supplement)

  series_prefix >> (date_range_only | with_base)
end
1113
+
1114
+ root(:identifier)
1115
+ end
1116
+ end
1117
+ end