pubid 1.15.17 → 2.0.0.pre.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (601)
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +1 -1
  3. data/README.adoc +2041 -53
  4. data/archived-gems/pubid-ccsds/update_codes.yaml +1 -0
  5. data/archived-gems/pubid-iec/stages.yaml +129 -0
  6. data/archived-gems/pubid-iec/update_codes.yaml +67 -0
  7. data/archived-gems/pubid-ieee/update_codes.yaml +104 -0
  8. data/archived-gems/pubid-iso/stages.yaml +106 -0
  9. data/archived-gems/pubid-iso/update_codes.yaml +4 -0
  10. data/archived-gems/pubid-itu/i18n.yaml +13 -0
  11. data/archived-gems/pubid-itu/series.yaml +42 -0
  12. data/archived-gems/pubid-nist/publishers.yaml +6 -0
  13. data/archived-gems/pubid-nist/series.yaml +121 -0
  14. data/archived-gems/pubid-nist/stages.yaml +16 -0
  15. data/archived-gems/pubid-nist/update_codes.yaml +93 -0
  16. data/archived-gems/pubid-plateau/update_codes.yaml +6 -0
  17. data/data/ccsds/update_codes.yaml +1 -0
  18. data/data/iec/update_codes.yaml +67 -0
  19. data/data/ieee/update_codes.yaml +104 -0
  20. data/data/iso/update_codes.yaml +21 -0
  21. data/data/nist/update_codes.yaml +87 -0
  22. data/data/plateau/update_codes.yaml +6 -0
  23. data/lib/pubid/amca/builder.rb +176 -0
  24. data/lib/pubid/amca/identifier.rb +18 -0
  25. data/lib/pubid/amca/identifiers/base.rb +64 -0
  26. data/lib/pubid/amca/identifiers/interpretation.rb +51 -0
  27. data/lib/pubid/amca/identifiers/publication.rb +47 -0
  28. data/lib/pubid/amca/identifiers/standard.rb +22 -0
  29. data/lib/pubid/amca/identifiers.rb +12 -0
  30. data/lib/pubid/amca/parser.rb +153 -0
  31. data/lib/pubid/amca/scheme.rb +16 -0
  32. data/lib/pubid/amca/single_identifier.rb +33 -0
  33. data/lib/pubid/amca/urn_generator.rb +50 -0
  34. data/lib/pubid/amca.rb +26 -0
  35. data/lib/pubid/ansi/builder.rb +52 -0
  36. data/lib/pubid/ansi/identifier.rb +13 -0
  37. data/lib/pubid/ansi/identifiers/american_national_standard.rb +12 -0
  38. data/lib/pubid/ansi/identifiers/standard.rb +16 -0
  39. data/lib/pubid/ansi/identifiers.rb +11 -0
  40. data/lib/pubid/ansi/parser.rb +91 -0
  41. data/lib/pubid/ansi/scheme.rb +15 -0
  42. data/lib/pubid/ansi/single_identifier.rb +45 -0
  43. data/lib/pubid/ansi/urn_generator.rb +76 -0
  44. data/lib/pubid/ansi.rb +27 -0
  45. data/lib/pubid/api/builder.rb +85 -0
  46. data/lib/pubid/api/components/code.rb +9 -0
  47. data/lib/pubid/api/identifier.rb +21 -0
  48. data/lib/pubid/api/identifiers/base.rb +24 -0
  49. data/lib/pubid/api/identifiers/bulletin.rb +15 -0
  50. data/lib/pubid/api/identifiers/continuous_operations_standard.rb +15 -0
  51. data/lib/pubid/api/identifiers/mpms.rb +44 -0
  52. data/lib/pubid/api/identifiers/publication.rb +15 -0
  53. data/lib/pubid/api/identifiers/recommended_practice.rb +15 -0
  54. data/lib/pubid/api/identifiers/specification.rb +15 -0
  55. data/lib/pubid/api/identifiers/standard.rb +15 -0
  56. data/lib/pubid/api/identifiers/technical_report.rb +15 -0
  57. data/lib/pubid/api/identifiers/typeless_standard.rb +27 -0
  58. data/lib/pubid/api/parser.rb +140 -0
  59. data/lib/pubid/api/scheme.rb +66 -0
  60. data/lib/pubid/api/single_identifier.rb +46 -0
  61. data/lib/pubid/api/urn_generator.rb +41 -0
  62. data/lib/pubid/api.rb +17 -0
  63. data/lib/pubid/ashrae/builder.rb +498 -0
  64. data/lib/pubid/ashrae/identifier.rb +18 -0
  65. data/lib/pubid/ashrae/identifiers/addenda_package.rb +46 -0
  66. data/lib/pubid/ashrae/identifiers/addendum.rb +55 -0
  67. data/lib/pubid/ashrae/identifiers/base.rb +23 -0
  68. data/lib/pubid/ashrae/identifiers/combined_addenda.rb +51 -0
  69. data/lib/pubid/ashrae/identifiers/errata.rb +40 -0
  70. data/lib/pubid/ashrae/identifiers/guideline.rb +38 -0
  71. data/lib/pubid/ashrae/identifiers/interpretation.rb +39 -0
  72. data/lib/pubid/ashrae/identifiers/standard.rb +38 -0
  73. data/lib/pubid/ashrae/identifiers.rb +16 -0
  74. data/lib/pubid/ashrae/parser.rb +724 -0
  75. data/lib/pubid/ashrae/scheme.rb +53 -0
  76. data/lib/pubid/ashrae/single_identifier.rb +23 -0
  77. data/lib/pubid/ashrae/supplement_identifier.rb +23 -0
  78. data/lib/pubid/ashrae/urn_generator.rb +59 -0
  79. data/lib/pubid/ashrae.rb +21 -0
  80. data/lib/pubid/asme/builder.rb +153 -0
  81. data/lib/pubid/asme/components/code.rb +18 -0
  82. data/lib/pubid/asme/identifier.rb +15 -0
  83. data/lib/pubid/asme/identifiers/base.rb +70 -0
  84. data/lib/pubid/asme/identifiers/standard.rb +12 -0
  85. data/lib/pubid/asme/identifiers.rb +10 -0
  86. data/lib/pubid/asme/parser.rb +308 -0
  87. data/lib/pubid/asme/scheme.rb +37 -0
  88. data/lib/pubid/asme/single_identifier.rb +29 -0
  89. data/lib/pubid/asme/urn_generator.rb +133 -0
  90. data/lib/pubid/asme.rb +21 -0
  91. data/lib/pubid/astm/builder.rb +159 -0
  92. data/lib/pubid/astm/components/code.rb +33 -0
  93. data/lib/pubid/astm/identifier.rb +15 -0
  94. data/lib/pubid/astm/identifiers/adjunct.rb +21 -0
  95. data/lib/pubid/astm/identifiers/base.rb +13 -0
  96. data/lib/pubid/astm/identifiers/data_series.rb +25 -0
  97. data/lib/pubid/astm/identifiers/iso_dual_published.rb +74 -0
  98. data/lib/pubid/astm/identifiers/manual.rb +40 -0
  99. data/lib/pubid/astm/identifiers/monograph.rb +25 -0
  100. data/lib/pubid/astm/identifiers/research_report.rb +18 -0
  101. data/lib/pubid/astm/identifiers/standard.rb +52 -0
  102. data/lib/pubid/astm/identifiers/technical_report.rb +23 -0
  103. data/lib/pubid/astm/identifiers/work_in_progress.rb +21 -0
  104. data/lib/pubid/astm/parser.rb +244 -0
  105. data/lib/pubid/astm/scheme.rb +55 -0
  106. data/lib/pubid/astm/single_identifier.rb +25 -0
  107. data/lib/pubid/astm/urn_generator.rb +99 -0
  108. data/lib/pubid/astm.rb +38 -0
  109. data/lib/pubid/bsi/builder.rb +1483 -0
  110. data/lib/pubid/bsi/components/code.rb +11 -0
  111. data/lib/pubid/bsi/components/date.rb +11 -0
  112. data/lib/pubid/bsi/components/publisher.rb +11 -0
  113. data/lib/pubid/bsi/components/type.rb +11 -0
  114. data/lib/pubid/bsi/identifier.rb +27 -0
  115. data/lib/pubid/bsi/identifiers/addendum_document.rb +64 -0
  116. data/lib/pubid/bsi/identifiers/adopted_european_norm.rb +95 -0
  117. data/lib/pubid/bsi/identifiers/adopted_international_standard.rb +82 -0
  118. data/lib/pubid/bsi/identifiers/aerospace_standard.rb +118 -0
  119. data/lib/pubid/bsi/identifiers/amendment.rb +40 -0
  120. data/lib/pubid/bsi/identifiers/base.rb +11 -0
  121. data/lib/pubid/bsi/identifiers/british_industrial_practice.rb +27 -0
  122. data/lib/pubid/bsi/identifiers/british_standard.rb +33 -0
  123. data/lib/pubid/bsi/identifiers/bundled_identifier.rb +114 -0
  124. data/lib/pubid/bsi/identifiers/committee_document.rb +51 -0
  125. data/lib/pubid/bsi/identifiers/consolidated_identifier.rb +152 -0
  126. data/lib/pubid/bsi/identifiers/corrigendum.rb +28 -0
  127. data/lib/pubid/bsi/identifiers/detailed_specification.rb +69 -0
  128. data/lib/pubid/bsi/identifiers/disc.rb +56 -0
  129. data/lib/pubid/bsi/identifiers/draft_document.rb +71 -0
  130. data/lib/pubid/bsi/identifiers/electronic_book.rb +52 -0
  131. data/lib/pubid/bsi/identifiers/expert_commentary.rb +47 -0
  132. data/lib/pubid/bsi/identifiers/explanatory_supplement.rb +82 -0
  133. data/lib/pubid/bsi/identifiers/flex.rb +61 -0
  134. data/lib/pubid/bsi/identifiers/handbook.rb +39 -0
  135. data/lib/pubid/bsi/identifiers/index.rb +62 -0
  136. data/lib/pubid/bsi/identifiers/method.rb +76 -0
  137. data/lib/pubid/bsi/identifiers/national_annex.rb +73 -0
  138. data/lib/pubid/bsi/identifiers/practice_guide.rb +27 -0
  139. data/lib/pubid/bsi/identifiers/publicly_available_specification.rb +79 -0
  140. data/lib/pubid/bsi/identifiers/published_document.rb +79 -0
  141. data/lib/pubid/bsi/identifiers/section.rb +62 -0
  142. data/lib/pubid/bsi/identifiers/set.rb +46 -0
  143. data/lib/pubid/bsi/identifiers/standalone_amendment.rb +40 -0
  144. data/lib/pubid/bsi/identifiers/supplement_document.rb +51 -0
  145. data/lib/pubid/bsi/identifiers/supplementary_index.rb +81 -0
  146. data/lib/pubid/bsi/identifiers/technical_specification.rb +79 -0
  147. data/lib/pubid/bsi/identifiers/test_method.rb +67 -0
  148. data/lib/pubid/bsi/identifiers/value_added_publication.rb +52 -0
  149. data/lib/pubid/bsi/identifiers.rb +52 -0
  150. data/lib/pubid/bsi/model.rb +196 -0
  151. data/lib/pubid/bsi/parser.rb +659 -0
  152. data/lib/pubid/bsi/scheme.rb +243 -0
  153. data/lib/pubid/bsi/single_identifier.rb +129 -0
  154. data/lib/pubid/bsi/urn_generator.rb +84 -0
  155. data/lib/pubid/bsi.rb +32 -0
  156. data/lib/pubid/builder/base.rb +138 -0
  157. data/lib/pubid/bundled_identifier.rb +126 -0
  158. data/lib/pubid/ccsds/builder.rb +56 -0
  159. data/lib/pubid/ccsds/identifier.rb +16 -0
  160. data/lib/pubid/ccsds/identifiers/base.rb +78 -0
  161. data/lib/pubid/ccsds/identifiers/base_BASE_88929.rb +70 -0
  162. data/lib/pubid/ccsds/identifiers/corrigendum.rb +39 -0
  163. data/lib/pubid/ccsds/identifiers.rb +10 -0
  164. data/lib/pubid/ccsds/parser.rb +71 -0
  165. data/lib/pubid/ccsds/scheme.rb +57 -0
  166. data/lib/pubid/ccsds/single_identifier.rb +74 -0
  167. data/lib/pubid/ccsds/supplement_identifier.rb +33 -0
  168. data/lib/pubid/ccsds/urn_generator.rb +115 -0
  169. data/lib/pubid/ccsds.rb +21 -0
  170. data/lib/pubid/cen_cenelec/builder.rb +330 -0
  171. data/lib/pubid/cen_cenelec/identifier.rb +15 -0
  172. data/lib/pubid/cen_cenelec/identifiers/adopted_european_norm.rb +40 -0
  173. data/lib/pubid/cen_cenelec/identifiers/amendment.rb +29 -0
  174. data/lib/pubid/cen_cenelec/identifiers/base.rb +75 -0
  175. data/lib/pubid/cen_cenelec/identifiers/cen_report.rb +28 -0
  176. data/lib/pubid/cen_cenelec/identifiers/cen_workshop_agreement.rb +27 -0
  177. data/lib/pubid/cen_cenelec/identifiers/cenelec_harmonization_document.rb +28 -0
  178. data/lib/pubid/cen_cenelec/identifiers/consolidated_identifier.rb +61 -0
  179. data/lib/pubid/cen_cenelec/identifiers/corrigendum.rb +35 -0
  180. data/lib/pubid/cen_cenelec/identifiers/european_norm.rb +41 -0
  181. data/lib/pubid/cen_cenelec/identifiers/european_prestandard.rb +37 -0
  182. data/lib/pubid/cen_cenelec/identifiers/european_specification.rb +28 -0
  183. data/lib/pubid/cen_cenelec/identifiers/fragment.rb +22 -0
  184. data/lib/pubid/cen_cenelec/identifiers/guide.rb +27 -0
  185. data/lib/pubid/cen_cenelec/identifiers/harmonization_document.rb +27 -0
  186. data/lib/pubid/cen_cenelec/identifiers/technical_report.rb +27 -0
  187. data/lib/pubid/cen_cenelec/identifiers/technical_specification.rb +35 -0
  188. data/lib/pubid/cen_cenelec/identifiers.rb +32 -0
  189. data/lib/pubid/cen_cenelec/parser.rb +144 -0
  190. data/lib/pubid/cen_cenelec/scheme.rb +164 -0
  191. data/lib/pubid/cen_cenelec/single_identifier.rb +130 -0
  192. data/lib/pubid/cen_cenelec/supplement_identifier.rb +48 -0
  193. data/lib/pubid/cen_cenelec/urn_generator.rb +129 -0
  194. data/lib/pubid/cen_cenelec.rb +21 -0
  195. data/lib/pubid/cie/builder.rb +399 -0
  196. data/lib/pubid/cie/components/code.rb +72 -0
  197. data/lib/pubid/cie/components/language.rb +58 -0
  198. data/lib/pubid/cie/identifier.rb +18 -0
  199. data/lib/pubid/cie/identifiers/bundle.rb +20 -0
  200. data/lib/pubid/cie/identifiers/conference.rb +32 -0
  201. data/lib/pubid/cie/identifiers/corrigendum.rb +40 -0
  202. data/lib/pubid/cie/identifiers/dual_published.rb +41 -0
  203. data/lib/pubid/cie/identifiers/identical.rb +64 -0
  204. data/lib/pubid/cie/identifiers/joint_published.rb +52 -0
  205. data/lib/pubid/cie/identifiers/standard.rb +58 -0
  206. data/lib/pubid/cie/identifiers/supplement.rb +45 -0
  207. data/lib/pubid/cie/identifiers/tutorial_bundle.rb +20 -0
  208. data/lib/pubid/cie/identifiers.rb +17 -0
  209. data/lib/pubid/cie/parser.rb +347 -0
  210. data/lib/pubid/cie/scheme.rb +64 -0
  211. data/lib/pubid/cie/single_identifier.rb +30 -0
  212. data/lib/pubid/cie/supplement_identifier.rb +26 -0
  213. data/lib/pubid/cie/urn_generator.rb +123 -0
  214. data/lib/pubid/cie.rb +28 -0
  215. data/lib/pubid/components/code.rb +33 -0
  216. data/lib/pubid/components/date.rb +49 -0
  217. data/lib/pubid/components/edition.rb +32 -0
  218. data/lib/pubid/components/language.rb +37 -0
  219. data/lib/pubid/components/locality.rb +10 -0
  220. data/lib/pubid/components/publisher.rb +36 -0
  221. data/lib/pubid/components/stage.rb +54 -0
  222. data/lib/pubid/components/type.rb +58 -0
  223. data/lib/pubid/components/typed_stage.rb +55 -0
  224. data/lib/pubid/components.rb +15 -0
  225. data/lib/pubid/core/pattern_doc_generator.rb +272 -0
  226. data/lib/pubid/core/update_codes.rb +77 -0
  227. data/lib/pubid/core.rb +8 -0
  228. data/lib/pubid/csa/builder.rb +671 -0
  229. data/lib/pubid/csa/components/code.rb +9 -0
  230. data/lib/pubid/csa/components.rb +9 -0
  231. data/lib/pubid/csa/composite_identifier.rb +27 -0
  232. data/lib/pubid/csa/identifier.rb +457 -0
  233. data/lib/pubid/csa/identifiers/base.rb +133 -0
  234. data/lib/pubid/csa/identifiers/bundled.rb +125 -0
  235. data/lib/pubid/csa/identifiers/canadian_adopted.rb +82 -0
  236. data/lib/pubid/csa/identifiers/cec.rb +129 -0
  237. data/lib/pubid/csa/identifiers/combined.rb +130 -0
  238. data/lib/pubid/csa/identifiers/csa_adopted.rb +78 -0
  239. data/lib/pubid/csa/identifiers/package.rb +65 -0
  240. data/lib/pubid/csa/identifiers/series.rb +127 -0
  241. data/lib/pubid/csa/identifiers/standard.rb +10 -0
  242. data/lib/pubid/csa/identifiers.rb +17 -0
  243. data/lib/pubid/csa/parser.rb +445 -0
  244. data/lib/pubid/csa/scheme.rb +44 -0
  245. data/lib/pubid/csa/single_identifier.rb +30 -0
  246. data/lib/pubid/csa/urn_generator.rb +80 -0
  247. data/lib/pubid/csa/wrapper_identifier.rb +31 -0
  248. data/lib/pubid/csa.rb +25 -0
  249. data/lib/pubid/etsi/builder.rb +133 -0
  250. data/lib/pubid/etsi/components/code.rb +42 -0
  251. data/lib/pubid/etsi/components/version.rb +37 -0
  252. data/lib/pubid/etsi/components.rb +10 -0
  253. data/lib/pubid/etsi/identifier.rb +14 -0
  254. data/lib/pubid/etsi/identifiers/amendment.rb +15 -0
  255. data/lib/pubid/etsi/identifiers/base.rb +38 -0
  256. data/lib/pubid/etsi/identifiers/corrigendum.rb +15 -0
  257. data/lib/pubid/etsi/identifiers/etsi_standard.rb +19 -0
  258. data/lib/pubid/etsi/identifiers/supplement_identifier.rb +91 -0
  259. data/lib/pubid/etsi/identifiers.rb +14 -0
  260. data/lib/pubid/etsi/parser.rb +133 -0
  261. data/lib/pubid/etsi/scheme.rb +42 -0
  262. data/lib/pubid/etsi/urn_generator.rb +76 -0
  263. data/lib/pubid/etsi.rb +21 -0
  264. data/lib/pubid/export/auditor.rb +89 -0
  265. data/lib/pubid/export/data_class_exporter.rb +59 -0
  266. data/lib/pubid/export/exporter.rb +74 -0
  267. data/lib/pubid/export/flavor_exporter.rb +402 -0
  268. data/lib/pubid/export/ieee_exporter.rb +78 -0
  269. data/lib/pubid/export/itu_exporter.rb +66 -0
  270. data/lib/pubid/export/nist_exporter.rb +64 -0
  271. data/lib/pubid/export/registry_exporter.rb +90 -0
  272. data/lib/pubid/export/result.rb +97 -0
  273. data/lib/pubid/export/scheme_exporter.rb +70 -0
  274. data/lib/pubid/export.rb +18 -0
  275. data/lib/pubid/format_detector.rb +16 -0
  276. data/lib/pubid/format_registry.rb +42 -0
  277. data/lib/pubid/identifier.rb +235 -0
  278. data/lib/pubid/identifier_metadata.rb +148 -0
  279. data/lib/pubid/identifier_registry.rb +198 -0
  280. data/lib/pubid/idf/builder.rb +82 -0
  281. data/lib/pubid/idf/identifier.rb +69 -0
  282. data/lib/pubid/idf/identifiers/amendment.rb +27 -0
  283. data/lib/pubid/idf/identifiers/corrigendum.rb +27 -0
  284. data/lib/pubid/idf/identifiers/international_standard.rb +123 -0
  285. data/lib/pubid/idf/identifiers/reviewed_method.rb +100 -0
  286. data/lib/pubid/idf/identifiers.rb +13 -0
  287. data/lib/pubid/idf/parser.rb +143 -0
  288. data/lib/pubid/idf/scheme.rb +61 -0
  289. data/lib/pubid/idf/single_identifier.rb +19 -0
  290. data/lib/pubid/idf/supplement_identifier.rb +43 -0
  291. data/lib/pubid/idf/urn_generator.rb +84 -0
  292. data/lib/pubid/idf.rb +25 -0
  293. data/lib/pubid/iec/builder.rb +457 -0
  294. data/lib/pubid/iec/components/code.rb +59 -0
  295. data/lib/pubid/iec/components/consolidated_amendment.rb +59 -0
  296. data/lib/pubid/iec/components/publisher.rb +35 -0
  297. data/lib/pubid/iec/components/sheet.rb +32 -0
  298. data/lib/pubid/iec/components/trf_info.rb +38 -0
  299. data/lib/pubid/iec/components/vap_suffix.rb +41 -0
  300. data/lib/pubid/iec/identifier.rb +21 -0
  301. data/lib/pubid/iec/identifiers/amendment.rb +94 -0
  302. data/lib/pubid/iec/identifiers/base.rb +78 -0
  303. data/lib/pubid/iec/identifiers/component_specification.rb +39 -0
  304. data/lib/pubid/iec/identifiers/conformity_assessment.rb +39 -0
  305. data/lib/pubid/iec/identifiers/consolidated_identifier.rb +86 -0
  306. data/lib/pubid/iec/identifiers/corrigendum.rb +94 -0
  307. data/lib/pubid/iec/identifiers/fragment_identifier.rb +141 -0
  308. data/lib/pubid/iec/identifiers/guide.rb +104 -0
  309. data/lib/pubid/iec/identifiers/international_standard.rb +147 -0
  310. data/lib/pubid/iec/identifiers/interpretation_sheet.rb +104 -0
  311. data/lib/pubid/iec/identifiers/operational_document.rb +39 -0
  312. data/lib/pubid/iec/identifiers/publicly_available_specification.rb +101 -0
  313. data/lib/pubid/iec/identifiers/sheet_identifier.rb +66 -0
  314. data/lib/pubid/iec/identifiers/societal_technology_trend_report.rb +40 -0
  315. data/lib/pubid/iec/identifiers/systems_reference_document.rb +40 -0
  316. data/lib/pubid/iec/identifiers/technical_report.rb +132 -0
  317. data/lib/pubid/iec/identifiers/technical_specification.rb +132 -0
  318. data/lib/pubid/iec/identifiers/technology_report.rb +39 -0
  319. data/lib/pubid/iec/identifiers/test_report_form.rb +78 -0
  320. data/lib/pubid/iec/identifiers/vap_identifier.rb +77 -0
  321. data/lib/pubid/iec/identifiers/white_paper.rb +39 -0
  322. data/lib/pubid/iec/identifiers/working_document.rb +96 -0
  323. data/lib/pubid/iec/parser.rb +412 -0
  324. data/lib/pubid/iec/rendering_style.rb +113 -0
  325. data/lib/pubid/iec/scheme.rb +71 -0
  326. data/lib/pubid/iec/single_identifier.rb +80 -0
  327. data/lib/pubid/iec/supplement_identifier.rb +161 -0
  328. data/lib/pubid/iec/urn_generator.rb +193 -0
  329. data/lib/pubid/iec/urn_parser.rb +289 -0
  330. data/lib/pubid/iec.rb +85 -0
  331. data/lib/pubid/ieee/aiee/builder.rb +71 -0
  332. data/lib/pubid/ieee/aiee/identifier.rb +105 -0
  333. data/lib/pubid/ieee/aiee/parser.rb +130 -0
  334. data/lib/pubid/ieee/aiee.rb +11 -0
  335. data/lib/pubid/ieee/builder.rb +1237 -0
  336. data/lib/pubid/ieee/components/code.rb +102 -0
  337. data/lib/pubid/ieee/components/draft.rb +93 -0
  338. data/lib/pubid/ieee/components/relationship.rb +157 -0
  339. data/lib/pubid/ieee/components/typed_stage.rb +100 -0
  340. data/lib/pubid/ieee/identifier.rb +13 -0
  341. data/lib/pubid/ieee/identifiers/adopted_standard.rb +33 -0
  342. data/lib/pubid/ieee/identifiers/base.rb +591 -0
  343. data/lib/pubid/ieee/identifiers/conformance_identifier.rb +35 -0
  344. data/lib/pubid/ieee/identifiers/corrigendum.rb +37 -0
  345. data/lib/pubid/ieee/identifiers/csa_dual_published.rb +51 -0
  346. data/lib/pubid/ieee/identifiers/dual_identifier.rb +18 -0
  347. data/lib/pubid/ieee/identifiers/dual_published.rb +28 -0
  348. data/lib/pubid/ieee/identifiers/iec_ieee_copublished.rb +27 -0
  349. data/lib/pubid/ieee/identifiers/interpretation_identifier.rb +34 -0
  350. data/lib/pubid/ieee/identifiers/joint_development.rb +172 -0
  351. data/lib/pubid/ieee/identifiers/multi_numbered_identifier.rb +51 -0
  352. data/lib/pubid/ieee/identifiers/nesc/base.rb +56 -0
  353. data/lib/pubid/ieee/identifiers/nesc/draft.rb +28 -0
  354. data/lib/pubid/ieee/identifiers/nesc/handbook.rb +32 -0
  355. data/lib/pubid/ieee/identifiers/nesc/redline.rb +26 -0
  356. data/lib/pubid/ieee/identifiers/nesc/standard.rb +26 -0
  357. data/lib/pubid/ieee/identifiers/nesc.rb +15 -0
  358. data/lib/pubid/ieee/identifiers/parenthetical_identifier.rb +20 -0
  359. data/lib/pubid/ieee/identifiers/project_draft_identifier.rb +26 -0
  360. data/lib/pubid/ieee/identifiers/redlined_standard.rb +33 -0
  361. data/lib/pubid/ieee/identifiers/si_standard.rb +73 -0
  362. data/lib/pubid/ieee/identifiers/standard.rb +41 -0
  363. data/lib/pubid/ieee/identifiers/supplement_identifier.rb +23 -0
  364. data/lib/pubid/ieee/identifiers.rb +33 -0
  365. data/lib/pubid/ieee/ire/builder.rb +61 -0
  366. data/lib/pubid/ieee/ire/identifier.rb +58 -0
  367. data/lib/pubid/ieee/ire/parser.rb +91 -0
  368. data/lib/pubid/ieee/ire.rb +11 -0
  369. data/lib/pubid/ieee/nesc/builder.rb +101 -0
  370. data/lib/pubid/ieee/nesc/parser.rb +154 -0
  371. data/lib/pubid/ieee/nesc.rb +10 -0
  372. data/lib/pubid/ieee/parser.rb +1226 -0
  373. data/lib/pubid/ieee/scheme.rb +90 -0
  374. data/lib/pubid/ieee/typed_stages.rb +172 -0
  375. data/lib/pubid/ieee/urn_generator.rb +188 -0
  376. data/lib/pubid/ieee.rb +32 -0
  377. data/lib/pubid/ieee_debug.rb +31 -0
  378. data/lib/pubid/iho/builder.rb +37 -0
  379. data/lib/pubid/iho/identifier.rb +19 -0
  380. data/lib/pubid/iho/identifiers/base.rb +41 -0
  381. data/lib/pubid/iho/identifiers/bibliographic.rb +20 -0
  382. data/lib/pubid/iho/identifiers/circular_letter.rb +19 -0
  383. data/lib/pubid/iho/identifiers/miscellaneous.rb +20 -0
  384. data/lib/pubid/iho/identifiers/publication.rb +19 -0
  385. data/lib/pubid/iho/identifiers/standard.rb +19 -0
  386. data/lib/pubid/iho/identifiers.rb +14 -0
  387. data/lib/pubid/iho/parser.rb +68 -0
  388. data/lib/pubid/iho/scheme.rb +29 -0
  389. data/lib/pubid/iho/urn_generator.rb +29 -0
  390. data/lib/pubid/iho.rb +21 -0
  391. data/lib/pubid/iso/builder.rb +305 -0
  392. data/lib/pubid/iso/bundled_identifier.rb +85 -0
  393. data/lib/pubid/iso/combined_identifier.rb +22 -0
  394. data/lib/pubid/iso/components/code.rb +36 -0
  395. data/lib/pubid/iso/components/publisher.rb +60 -0
  396. data/lib/pubid/iso/components.rb +12 -0
  397. data/lib/pubid/iso/format_resolver.rb +45 -0
  398. data/lib/pubid/iso/identifier.rb +69 -0
  399. data/lib/pubid/iso/identifiers/addendum.rb +104 -0
  400. data/lib/pubid/iso/identifiers/amendment.rb +128 -0
  401. data/lib/pubid/iso/identifiers/base.rb +115 -0
  402. data/lib/pubid/iso/identifiers/corrigendum.rb +108 -0
  403. data/lib/pubid/iso/identifiers/data.rb +76 -0
  404. data/lib/pubid/iso/identifiers/directives.rb +59 -0
  405. data/lib/pubid/iso/identifiers/directives_supplement.rb +119 -0
  406. data/lib/pubid/iso/identifiers/extract.rb +30 -0
  407. data/lib/pubid/iso/identifiers/guide.rb +100 -0
  408. data/lib/pubid/iso/identifiers/international_standard.rb +168 -0
  409. data/lib/pubid/iso/identifiers/international_standardized_profile.rb +94 -0
  410. data/lib/pubid/iso/identifiers/international_workshop_agreement.rb +89 -0
  411. data/lib/pubid/iso/identifiers/pas.rb +93 -0
  412. data/lib/pubid/iso/identifiers/recommendation.rb +45 -0
  413. data/lib/pubid/iso/identifiers/supplement.rb +87 -0
  414. data/lib/pubid/iso/identifiers/tc_document.rb +108 -0
  415. data/lib/pubid/iso/identifiers/technical_report.rb +103 -0
  416. data/lib/pubid/iso/identifiers/technical_specification.rb +102 -0
  417. data/lib/pubid/iso/identifiers/technology_trends_assessments.rb +95 -0
  418. data/lib/pubid/iso/identifiers.rb +33 -0
  419. data/lib/pubid/iso/parser.rb +510 -0
  420. data/lib/pubid/iso/rendering_style.rb +120 -0
  421. data/lib/pubid/iso/scheme.rb +187 -0
  422. data/lib/pubid/iso/single_identifier.rb +61 -0
  423. data/lib/pubid/iso/supplement_identifier.rb +27 -0
  424. data/lib/pubid/iso/urn_generator.rb +412 -0
  425. data/lib/pubid/iso/urn_parser.rb +423 -0
  426. data/lib/pubid/iso/utilities.rb +86 -0
  427. data/lib/pubid/iso.rb +50 -0
  428. data/lib/pubid/itu/builder.rb +171 -0
  429. data/lib/pubid/itu/components/code.rb +39 -0
  430. data/lib/pubid/itu/components/sector.rb +35 -0
  431. data/lib/pubid/itu/components/series.rb +29 -0
  432. data/lib/pubid/itu/i18n.rb +9 -0
  433. data/lib/pubid/itu/i18n.yaml +30 -0
  434. data/lib/pubid/itu/identifier.rb +53 -0
  435. data/lib/pubid/itu/identifiers/amendment.rb +43 -0
  436. data/lib/pubid/itu/identifiers/annex.rb +74 -0
  437. data/lib/pubid/itu/identifiers/base.rb +154 -0
  438. data/lib/pubid/itu/identifiers/combined_identifier.rb +47 -0
  439. data/lib/pubid/itu/identifiers/corrigendum.rb +44 -0
  440. data/lib/pubid/itu/identifiers/recommendation.rb +16 -0
  441. data/lib/pubid/itu/identifiers/special_publication.rb +31 -0
  442. data/lib/pubid/itu/identifiers/supplement.rb +46 -0
  443. data/lib/pubid/itu/identifiers.rb +16 -0
  444. data/lib/pubid/itu/model.rb +111 -0
  445. data/lib/pubid/itu/parser.rb +225 -0
  446. data/lib/pubid/itu/scheme.rb +174 -0
  447. data/lib/pubid/itu/urn_generator.rb +105 -0
  448. data/lib/pubid/itu.rb +22 -0
  449. data/lib/pubid/jcgm/builder.rb +88 -0
  450. data/lib/pubid/jcgm/components/publisher.rb +20 -0
  451. data/lib/pubid/jcgm/components.rb +9 -0
  452. data/lib/pubid/jcgm/identifier.rb +11 -0
  453. data/lib/pubid/jcgm/identifiers/amendment.rb +35 -0
  454. data/lib/pubid/jcgm/identifiers/guide.rb +21 -0
  455. data/lib/pubid/jcgm/identifiers/gum_guide.rb +51 -0
  456. data/lib/pubid/jcgm/identifiers.rb +11 -0
  457. data/lib/pubid/jcgm/parser.rb +84 -0
  458. data/lib/pubid/jcgm/scheme.rb +60 -0
  459. data/lib/pubid/jcgm/single_identifier.rb +48 -0
  460. data/lib/pubid/jcgm/supplement_identifier.rb +16 -0
  461. data/lib/pubid/jcgm/urn_generator.rb +110 -0
  462. data/lib/pubid/jcgm.rb +31 -0
  463. data/lib/pubid/jis/builder.rb +124 -0
  464. data/lib/pubid/jis/components/code.rb +59 -0
  465. data/lib/pubid/jis/components.rb +9 -0
  466. data/lib/pubid/jis/identifier.rb +18 -0
  467. data/lib/pubid/jis/identifiers/amendment.rb +16 -0
  468. data/lib/pubid/jis/identifiers/base.rb +72 -0
  469. data/lib/pubid/jis/identifiers/explanation.rb +22 -0
  470. data/lib/pubid/jis/identifiers/japanese_industrial_standard.rb +16 -0
  471. data/lib/pubid/jis/identifiers/standard.rb +27 -0
  472. data/lib/pubid/jis/identifiers/technical_report.rb +31 -0
  473. data/lib/pubid/jis/identifiers/technical_specification.rb +31 -0
  474. data/lib/pubid/jis/identifiers.rb +17 -0
  475. data/lib/pubid/jis/parser.rb +109 -0
  476. data/lib/pubid/jis/scheme.rb +49 -0
  477. data/lib/pubid/jis/single_identifier.rb +37 -0
  478. data/lib/pubid/jis/supplement_identifier.rb +47 -0
  479. data/lib/pubid/jis/urn_generator.rb +25 -0
  480. data/lib/pubid/jis.rb +23 -0
  481. data/lib/pubid/lutaml/no_store_registration.rb +30 -0
  482. data/lib/pubid/nist/builder.rb +2100 -0
  483. data/lib/pubid/nist/components/code.rb +38 -0
  484. data/lib/pubid/nist/components/edition.rb +118 -0
  485. data/lib/pubid/nist/components/issue_number.rb +28 -0
  486. data/lib/pubid/nist/components/part.rb +77 -0
  487. data/lib/pubid/nist/components/publisher.rb +24 -0
  488. data/lib/pubid/nist/components/stage.rb +53 -0
  489. data/lib/pubid/nist/components/supplement.rb +121 -0
  490. data/lib/pubid/nist/components/translation.rb +42 -0
  491. data/lib/pubid/nist/components/update.rb +103 -0
  492. data/lib/pubid/nist/components/version.rb +35 -0
  493. data/lib/pubid/nist/components/volume.rb +32 -0
  494. data/lib/pubid/nist/components.rb +19 -0
  495. data/lib/pubid/nist/configuration.rb +77 -0
  496. data/lib/pubid/nist/identifiers/base.rb +499 -0
  497. data/lib/pubid/nist/identifiers/circular.rb +68 -0
  498. data/lib/pubid/nist/identifiers/circular_supplement.rb +50 -0
  499. data/lib/pubid/nist/identifiers/commercial_standard.rb +41 -0
  500. data/lib/pubid/nist/identifiers/commercial_standard_emergency.rb +56 -0
  501. data/lib/pubid/nist/identifiers/commercial_standards_monthly.rb +56 -0
  502. data/lib/pubid/nist/identifiers/crpl_report.rb +135 -0
  503. data/lib/pubid/nist/identifiers/federal_information_processing_standards.rb +94 -0
  504. data/lib/pubid/nist/identifiers/grant_contractor_report.rb +35 -0
  505. data/lib/pubid/nist/identifiers/handbook.rb +50 -0
  506. data/lib/pubid/nist/identifiers/internal_report.rb +56 -0
  507. data/lib/pubid/nist/identifiers/letter_circular.rb +45 -0
  508. data/lib/pubid/nist/identifiers/miscellaneous_publication.rb +65 -0
  509. data/lib/pubid/nist/identifiers/monograph.rb +69 -0
  510. data/lib/pubid/nist/identifiers/ncstar.rb +41 -0
  511. data/lib/pubid/nist/identifiers/nsrds.rb +41 -0
  512. data/lib/pubid/nist/identifiers/owmwp.rb +35 -0
  513. data/lib/pubid/nist/identifiers/report.rb +68 -0
  514. data/lib/pubid/nist/identifiers/special_publication.rb +36 -0
  515. data/lib/pubid/nist/identifiers/technical_note.rb +90 -0
  516. data/lib/pubid/nist/identifiers.rb +33 -0
  517. data/lib/pubid/nist/parser.rb +1084 -0
  518. data/lib/pubid/nist/scheme.rb +199 -0
  519. data/lib/pubid/nist/supplement_identifier.rb +83 -0
  520. data/lib/pubid/nist/urn_generator.rb +127 -0
  521. data/lib/pubid/nist.rb +36 -0
  522. data/lib/pubid/oiml/builder.rb +189 -0
  523. data/lib/pubid/oiml/components/code.rb +20 -0
  524. data/lib/pubid/oiml/components.rb +9 -0
  525. data/lib/pubid/oiml/identifier.rb +11 -0
  526. data/lib/pubid/oiml/identifiers/amendment.rb +13 -0
  527. data/lib/pubid/oiml/identifiers/annex.rb +62 -0
  528. data/lib/pubid/oiml/identifiers/base.rb +36 -0
  529. data/lib/pubid/oiml/identifiers/basic_publication.rb +13 -0
  530. data/lib/pubid/oiml/identifiers/document.rb +13 -0
  531. data/lib/pubid/oiml/identifiers/expert_report.rb +13 -0
  532. data/lib/pubid/oiml/identifiers/guide.rb +13 -0
  533. data/lib/pubid/oiml/identifiers/recommendation.rb +13 -0
  534. data/lib/pubid/oiml/identifiers/seminar_report.rb +13 -0
  535. data/lib/pubid/oiml/identifiers/vocabulary.rb +13 -0
  536. data/lib/pubid/oiml/identifiers.rb +18 -0
  537. data/lib/pubid/oiml/parser.rb +173 -0
  538. data/lib/pubid/oiml/scheme.rb +46 -0
  539. data/lib/pubid/oiml/single_identifier.rb +90 -0
  540. data/lib/pubid/oiml/supplement_identifier.rb +43 -0
  541. data/lib/pubid/oiml/urn_generator.rb +64 -0
  542. data/lib/pubid/oiml.rb +26 -0
  543. data/lib/pubid/parser/common_parse_methods.rb +13 -0
  544. data/lib/pubid/parser/common_parse_rules.rb +56 -0
  545. data/lib/pubid/parser.rb +8 -0
  546. data/lib/pubid/parsers/base.rb +11 -0
  547. data/lib/pubid/parsers/mr_string.rb +93 -0
  548. data/lib/pubid/plateau/builder.rb +50 -0
  549. data/lib/pubid/plateau/identifiers/annex.rb +16 -0
  550. data/lib/pubid/plateau/identifiers/base.rb +51 -0
  551. data/lib/pubid/plateau/identifiers/handbook.rb +34 -0
  552. data/lib/pubid/plateau/identifiers/technical_report.rb +20 -0
  553. data/lib/pubid/plateau/identifiers.rb +12 -0
  554. data/lib/pubid/plateau/parser.rb +63 -0
  555. data/lib/pubid/plateau/scheme.rb +45 -0
  556. data/lib/pubid/plateau/supplement_identifier.rb +72 -0
  557. data/lib/pubid/plateau/urn_generator.rb +29 -0
  558. data/lib/pubid/plateau.rb +25 -0
  559. data/lib/pubid/renderers/base.rb +19 -0
  560. data/lib/pubid/renderers/directives_renderer.rb +62 -0
  561. data/lib/pubid/renderers/guide_renderer.rb +20 -0
  562. data/lib/pubid/renderers/human_readable.rb +58 -0
  563. data/lib/pubid/renderers/iwa_renderer.rb +16 -0
  564. data/lib/pubid/renderers/mr_string.rb +16 -0
  565. data/lib/pubid/renderers/supplement_renderer.rb +33 -0
  566. data/lib/pubid/renderers/urn.rb +11 -0
  567. data/lib/pubid/renderers.rb +14 -0
  568. data/lib/pubid/rendering/base.rb +73 -0
  569. data/lib/pubid/rendering/common.rb +211 -0
  570. data/lib/pubid/rendering/context.rb +156 -0
  571. data/lib/pubid/rendering/date.rb +27 -0
  572. data/lib/pubid/rendering/format.rb +25 -0
  573. data/lib/pubid/rendering/language.rb +21 -0
  574. data/lib/pubid/rendering/numbering.rb +24 -0
  575. data/lib/pubid/rendering/publisher.rb +25 -0
  576. data/lib/pubid/rendering/stage.rb +38 -0
  577. data/lib/pubid/rendering/supplement.rb +46 -0
  578. data/lib/pubid/rendering.rb +16 -0
  579. data/lib/pubid/sae/builder.rb +32 -0
  580. data/lib/pubid/sae/components/code.rb +9 -0
  581. data/lib/pubid/sae/components/date.rb +19 -0
  582. data/lib/pubid/sae/components/type.rb +19 -0
  583. data/lib/pubid/sae/components.rb +11 -0
  584. data/lib/pubid/sae/identifier.rb +14 -0
  585. data/lib/pubid/sae/identifiers/base.rb +42 -0
  586. data/lib/pubid/sae/identifiers.rb +9 -0
  587. data/lib/pubid/sae/parser.rb +55 -0
  588. data/lib/pubid/sae/scheme.rb +47 -0
  589. data/lib/pubid/sae/urn_generator.rb +38 -0
  590. data/lib/pubid/sae.rb +19 -0
  591. data/lib/pubid/scheme.rb +207 -0
  592. data/lib/pubid/urn_generator/base.rb +110 -0
  593. data/lib/pubid/utils/string_normalizer.rb +196 -0
  594. data/lib/pubid/utils.rb +7 -0
  595. data/lib/pubid/version.rb +3 -1
  596. data/lib/pubid.rb +137 -13
  597. data/lib/tasks/docs.rake +37 -0
  598. data/lib/tasks/export.rake +38 -0
  599. data/lib/tasks/website-data.json +7488 -0
  600. metadata +613 -171
  601. data/lib/pubid/registry.rb +0 -30
@@ -0,0 +1,1084 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "parslet"
4
+
5
+ module Pubid
6
+ module Nist
7
+ # Parser class for NIST identifiers
8
+ # Single Responsibility: Parsing NIST identifier syntax
9
+ class Parser < Parslet::Parser
10
# Normalizes a raw NIST identifier and parses it with the Parslet grammar.
#
# Named explicitly so it does not clash with Parslet's instance-level +parse+.
# The substitutions below run in a fixed order and several depend on the
# output of earlier ones, so treat the sequence as order-sensitive.
#
# @param input [#to_s] raw identifier text; surrounding whitespace is ignored
# @return [Hash, Object] for Hash and Array parse results, a single Hash with
#   +:parsed_format+ (+:mr+ or +:short+) merged in; any other parse result is
#   returned unchanged
# @note Errors raised by the parser propagate to the caller.
def self.class_parse_with_preprocessing(input)
  source = input.to_s.strip

  # Apply legacy update_codes normalization first, before any other preprocessing.
  cleaned = Core::UpdateCodes.apply(source, :nist)

  # Fix lowercase publisher at start.
  cleaned = cleaned.sub(/^nbs\b/i, "NBS")
  cleaned = cleaned.sub(/^nist\b/i, "NIST")

  # Split publisher+series concatenation: "NISTIR" -> "NIST IR", "NBSIR" -> "NBS IR".
  # The match is case-insensitive, so both parts are upcased explicitly; the
  # publisher fix above cannot do it because "nistir" has no word boundary after "nist".
  cleaned = cleaned.gsub(/^(NBS|NIST)(IR|FIPS|GCR|HB|MONO|MP|NCSTAR|NSRDS)/i) do
    "#{Regexp.last_match(1).upcase} #{Regexp.last_match(2).upcase}"
  end

  # Upcase the first lowercase series token (ir, sp, tn, ...).
  cleaned = cleaned.sub(/\b(ir|sp|tn|hb|fips|ams|vts)\b/i, &:upcase)

  # Normalize standalone "LC" to "LCIRC"; an existing "LCIRC" is not touched.
  cleaned = cleaned.gsub(/\bLC\b(?!IRC)/, "LCIRC")

  # Join "NBS LCIRC" into "NBS.LCIRC" only when a supplement/revision marker
  # follows later in the string (for the circ_supplement_identifier rule).
  cleaned = cleaned.gsub(/\bNBS LCIRC\b(?=.*\b(?:supp?|sup\+|r\d+\/)\d)/,
                         "NBS.LCIRC")

  # MR-format LCIRC revisions back to space-separated form:
  # "NBS.LCIRC.145r11/1925" -> "NBS LCIRC 145r11/1925"
  cleaned = cleaned.gsub(/\bNBS\.LCIRC\.(\d+r\d+\/\d{4})/,
                         "NBS LCIRC \\1")
  # Same without the year: "NBS.LCIRC.145r11" -> "NBS LCIRC 145r11"
  cleaned = cleaned.gsub(/\bNBS\.LCIRC\.(\d+r\d+)\b/, "NBS LCIRC \\1")

  # Roman numeral numbers with a spaced version: "1011-I-2 0" -> "1011-I-2.0"
  cleaned = cleaned.gsub(/([-\d]+[IVX]+[-\d]+)\s+(\d+)/, '\1.\2')

  # "126rev2013" -> "126 rev2013"; edition+revision such as "e2rev1908" is kept.
  cleaned = cleaned.gsub(/(?<!e)(\d)(rev\d{4})/, '\1 \2')

  # "145r6/1925" -> "145 r6/1925", except for CIRC/LCIRC identifiers, whose
  # grammar rule expects the attached form ("145r11").
  unless cleaned.include?("CIRC")
    cleaned = cleaned.gsub(/(\d)(r\d+\/\d{4})/, '\1 \2')
  end

  # Put a space before a standalone r+year token ("r1995"). The leading \b
  # means an "r" glued to a preceding digit or letter (e.g. "Mar1985",
  # "1013rv1953") is left alone.
  cleaned = cleaned.gsub(/\b(r(?!v)\d{4})\b/, ' \1')

  # Month in revision: "4743rJun1992" -> "4743 rJun1992"
  cleaned = cleaned.gsub(/(\d)(r[A-Z][a-z]{2,8}\d{4})/, '\1 \2')

  # Trailing dash-letter suffix to uppercase: "6529-a" -> "6529-A"
  cleaned = cleaned.gsub(/(\d)-([a-z])$/) do
    "#{Regexp.last_match(1)}-#{Regexp.last_match(2).upcase}"
  end

  # Trailing attached letter suffix to uppercase: "378g" -> "378G".
  # A trailing "r" is excluded so "73-197r" stays lowercase.
  cleaned = cleaned.gsub(/(\d)([a-z&&[^r]])$/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).upcase}"
  end
  # Letter after a trailing revision: "22r1a" -> "22r1A"
  cleaned = cleaned.gsub(/(\d)(r)(\d+)([a-z])$/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2)}#{Regexp.last_match(3)}#{Regexp.last_match(4).upcase}"
  end
  # Letter between number and revision: "53ar1" -> "53Ar1"
  cleaned = cleaned.gsub(/(\d)([a-z])(r\d)/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).upcase}#{Regexp.last_match(3)}"
  end

  # Letter before a volume: "1-2bv1" -> "1-2Bv1". "rv" is excluded, and
  # NCSTAR identifiers keep their lowercase letters ("1-1av1").
  unless cleaned.include?("NCSTAR")
    cleaned = cleaned.gsub(/(\d)([a-z&&[^r]])(v\d+)/) do
      "#{Regexp.last_match(1)}#{Regexp.last_match(2).upcase}#{Regexp.last_match(3)}"
    end
  end

  # Trailing single digit after NN-NNNN is a volume: "80-2073 2" -> "80-2073 v2"
  cleaned = cleaned.gsub(/(\d{2}-\d{4})\s+(\d)$/, '\1 v\2')

  # Draft with number: "8270-draft2" -> "8270 -draft 2"
  cleaned = cleaned.gsub(/(\d)-draft(\d)/, '\1 -draft \2')
  # Draft without dash: "8270draft2" -> "8270 -draft 2"
  cleaned = cleaned.gsub(/(\d)draft(\d)/, '\1 -draft \2')

  # Supplement typo: "154suprev" -> "154supprev"
  cleaned = cleaned.gsub(/(\d)suprev/, '\1supprev')

  # Letter suffix + revision before draft: "140Cr1-draft" -> "140C r1-draft".
  # NOTE(review): this runs after the draft rules above, which already rewrite
  # "-draftN" to " -draft N"; confirm which inputs still reach this rule.
  cleaned = cleaned.gsub(/(\d{2,})([A-Z])(r\d+)([-\s]draft\d*)/,
                         '\1\2 \3\4')

  # Roman numeral volumes to Arabic volume + version:
  # "1011-I-2.0" -> "1011 v1 ver2.0", "1011-II-1.0" -> "1011 v2 ver1.0"
  cleaned = cleaned.gsub(/(\d+)-([IVX]+)-(\d+(?:\.\d+)*)/) do
    number = Regexp.last_match(1)
    roman = Regexp.last_match(2)
    version_part = Regexp.last_match(3)

    "#{number} v#{roman_to_arabic(roman)} ver#{version_part}"
  end

  # LCIRC supplement with slash and year: "118supp3/1926" -> "118 supp3/1926"
  cleaned = cleaned.gsub(/(\d)(supp\d+\/\d{4})/, '\1 \2')

  # Pt pattern: "800-57Pt3r1" -> "800-57 pt3 r1"
  cleaned = cleaned.gsub(/(\d)Pt(\d+)(r\d+)/, '\1 pt\2 \3')

  # Version patterns: "ver1e2006" -> "ver1 e2006", "ver2v1" -> "ver2 v1"
  cleaned = cleaned.gsub(/(\d)ver(\d)/, '\1 ver \2')
  cleaned = cleaned.gsub(/ver(\d+)e(\d{4})/, 'ver\1 e\2')
  cleaned = cleaned.gsub(/ver(\d+)v(\d+)/, 'ver\1 v\2')

  # Dotted version attached to a number: "268v1.1" -> "268 v1.1"
  cleaned = cleaned.gsub(/(\d)(v\d+\.\d+)/, '\1 \2')
  # Identical second pass, kept from the original sequence.
  cleaned = cleaned.gsub(/(\d)(v\d+\.\d+)/, '\1 \2')

  # Spaced version parts at end of string:
  # "268v1 1" -> "268 v1.1", "63v1 0 1" -> "63 v1.0.1"
  cleaned = cleaned.gsub(/(\d)(v\d+)\s+(\d+)$/, '\1 \2.\3')
  cleaned = cleaned.gsub(/(\d)(v\d+)\s+(\d+)\s+(\d+)$/, '\1 \2.\3.\4')

  # Volume ranges: "535v2a-l" -> "535 v2a-l"
  cleaned = cleaned.gsub(/(\d)(v\d+[a-z]-[a-z])/, '\1 \2')

  # Volume with uppercase letter: "48v3B" -> "48 v3B"
  cleaned = cleaned.gsub(/(\d)(v\d+[A-Z])/, '\1 \2')

  # Uppercase volume ranges to lowercase: "v2A-L" -> "v2a-l".
  # A block is required here: calling downcase on a '\1\2-\3' template only
  # downcases the template text, not the captured letters.
  cleaned = cleaned.gsub(/(v\d+)([A-Z])-([A-Z])/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2).downcase}-#{Regexp.last_match(3).downcase}"
  end

  # Edition with "ed." suffix: "2006ed." -> "e2006"
  cleaned = cleaned.gsub(/(\d{4})ed\./, 'e\1')

  # Revision with a lowercase letter suffix stays attached, letter upcased:
  # "800-22r1a" -> "800-22r1A". The revision is limited to 1-2 digits.
  cleaned = cleaned.gsub(/(\d+)(r\d{1,2})([a-z])(?=-|[A-Z]|$)/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2)}#{Regexp.last_match(3).upcase}"
  end
  # Revision without a letter suffix gets a leading space only when followed
  # by dash+uppercase or /upd, /errata, /insert; a trailing "800-53r4" is
  # left compact.
  cleaned = cleaned.gsub(/(\d+)(r\d{1,2})(?![a-zA-Z])(?=[A-Z]|-(?=[A-Z])|\/(?:upd|errata|insert))/) do
    "#{Regexp.last_match(1)} #{Regexp.last_match(2)}"
  end

  # Spaces inside version/volume numbers become dots: "v1 0 1" -> "v1.0.1",
  # "v1 1" -> "v1.1". \b keeps the "v" inside "rev" from matching.
  cleaned = cleaned.gsub(/(\b(?:v|\d)[v\d]*[-A-Z]*)\s+(\d+)\s+(\d+)/, '\1.\2.\3')
  cleaned = cleaned.gsub(/(\b(?:v|\d)[v\d]*)\s+(\d+)/, '\1.\2')

  # Update markers: ensure a space before -upd / /upd (optionally numbered).
  cleaned = cleaned.gsub(/(\d+)-upd(\d*)/, '\1 -upd\2')
  cleaned = cleaned.gsub(/(\d+)\/upd(\d*)/, '\1 /upd\2')
  cleaned = cleaned.gsub(/([a-z]\d+)-upd/, '\1 -upd')
  cleaned = cleaned.gsub(/([a-z]\d+)\/upd/, '\1 /upd')

  # Same after an uppercase letter suffix: "8286C-upd1" -> "8286C -upd1"
  cleaned = cleaned.gsub(/(\d+[A-Z])-upd(\d*)/, '\1 -upd\2')
  cleaned = cleaned.gsub(/(\d+[A-Z])\/upd(\d*)/, '\1 /upd\2')

  # Supplement markers: ensure a space before "sup" variants
  # ("100-2sup1" -> "100-2 sup1", "100-2sup+1" -> "100-2 sup+1").
  # NOTE(review): /sup+/ in the second rule means "su" followed by one or more
  # "p" (so it also matches "supp3"), not a literal "+"; kept as-is.
  cleaned = cleaned.gsub(/(\d)(sup\d)/, '\1 \2')
  cleaned = cleaned.gsub(/(\d)(sup+)(\d)/, '\1 \2\3')
  cleaned = cleaned.gsub(/(\d)(sup\+)(\d)/, '\1 \2\3')
  cleaned = cleaned.gsub(/(\d)(sup\d+)/, '\1 \2')
  cleaned = cleaned.gsub(/(\d)(sup\d+\b)/, '\1 \2')

  # Letter suffix + "sup": "378Gsup" -> "378Gsupp"
  cleaned = cleaned.gsub(/(\d+[A-Z])sup(\b)/, '\1supp\2')

  # LCIRC supplement with year: "118sup12/1926" -> "118supp12/1926"
  cleaned = cleaned.gsub(/(\d+)sup(\d+\/\d{4})/, '\1supp\2')

  # Letter suffix followed by standalone "r": "56ar" -> "56a r"
  cleaned = cleaned.gsub(/(\d[a-z])r\b/, '\1 r')

  # Revision followed by language code: "r1es" -> "r1 es", "r1pt" -> "r1 pt"
  cleaned = cleaned.gsub(/(r\d+)(es|pt|chi|viet|port|esp)\b/, '\1 \2')

  # MR-format translation codes: "NIST.SP.1262.spa" -> "NIST.SP.1262 spa",
  # so the code is not read as a letter suffix.
  cleaned = cleaned.gsub(/^([A-Z]+)\.SP\.(\d+)\.([a-z]{2,4})$/,
                         '\1.SP.\2 \3')
  cleaned = cleaned.gsub(/^([A-Z]+)\.([A-Z]+)\.(\d+)\.([a-z]{2,4})$/,
                         '\1.\2.\3 \4')

  # Edition year normalization: trailing "-YYYY" -> "eYYYY"
  # ("330-2019" -> "330e2019"). Not applied after "e"/"E"/"-" or after an
  # "eN" edition, and only for years 1901-2099; other four-digit values are
  # treated as part numbers (e.g. "250-1039").
  cleaned = cleaned.gsub(/(?<!e\d)(?<![eE-])(\d(?:[A-DF-Z]?))-(\d{4})(?=\s|$)/) do |match|
    prefix = Regexp.last_match(1)
    year = Regexp.last_match(2).to_i
    year.between?(1901, 2099) ? "#{prefix}e#{year}" : match
  end

  # HB keeps the -YYYY form: "HB 130e1979" -> "HB 130-1979",
  # "HB 105-1e1990" -> "HB 105-1-1990". Dots are excluded from the separator
  # so MR identifiers such as "NIST.HB.135e2022" are not reverted. The text
  # between the series and the final number is captured and re-emitted so it
  # is not dropped from the result.
  cleaned = cleaned.gsub(/\b(HB|HB\s+)([^:\s.]*?)(\d+)e(\d{4})(?=\s|$)/,
                         '\1\2\3-\4')

  # OWMWP uses a MM-DD-YYYY date as its number:
  # "OWMWP 06-13e2018" -> "OWMWP 06-13-2018". The separator is captured and
  # re-emitted here as well.
  cleaned = cleaned.gsub(
    /\b(OWMWP|OWMWP\s*)([^:\s]*?)(\d{2})-(\d{2})e(\d{4})(?=\s|$)/, '\1\2\3-\4-\5'
  )

  # RPT uses year ranges as its number: "RPT 1946e1947" -> "RPT 1946-1947",
  # but only when the first year is smaller than the second.
  cleaned = cleaned.gsub(/\b(RPT|RPT\s*)([^:\s]*?)(\d{4})e(\d{4})(?=\s|$)/) do |match|
    prefix = Regexp.last_match(1)
    separator = Regexp.last_match(2)
    first_year = Regexp.last_match(3).to_i
    second_year = Regexp.last_match(4).to_i

    first_year < second_year ? "#{prefix}#{separator}#{first_year}-#{second_year}" : match
  end

  # Version normalization. The MR form must be handled before the general
  # "v" rule: "NIST.SP.500-281-v1.0" -> "NIST.SP.500-281.ver1.0"
  cleaned = cleaned.gsub(/-v(\d+\.\d+)/, '.ver\1')
  # "Ver. 2.0" -> "ver2.0"
  cleaned = cleaned.gsub(/\bVer\.\s+(\d+(?:\.\d+)*)/, 'ver\1')
  # "v1.1" -> "ver1.1" (dotted versions only)
  cleaned = cleaned.gsub(/\bv(\d+\.\d+(?:\.\d+)*)/, 'ver\1')

  # Uppercase P for part: "428P1" -> "428 p1"
  cleaned = cleaned.gsub(/(\d)P(\d)/, '\1 p\2')

  # Part notation "p1"/"n1" -> "pt1"; skipped when the digits are followed by
  # exactly four more digits and a word boundary (a year).
  cleaned = cleaned.gsub(/\b([pn])(\d+)(?!\d{4}\b)/, 'pt\2')

  # Ensure a space before an attached part: "800-57p1" -> "800-57 p1"
  cleaned = cleaned.gsub(/(\d)([pP]\d+)/, '\1 \2')

  # CRPL-F series: "CRPL-F-B150" -> "CRPL-F-B 150"
  cleaned = cleaned.gsub(/(NBS CRPL-F-[AB])(\d)/, '\1 \2')
  cleaned = cleaned.gsub(/(CRPL-F-[AB])(\d)/, '\1 \2')

  # Volume attached to a dashed number: "17-917v3" -> "17-917 v3";
  # dotted versions are excluded by the lookahead.
  cleaned = cleaned.gsub(/(\d+-\d+)(v\d+)(?![.\d])/, '\1 \2')

  # "955 Suppl" -> "955Suppl"
  cleaned = cleaned.gsub(/(\d+)\s+Suppl\b/, '\1Suppl')

  # " Version 2" -> " ver 2"
  cleaned = cleaned.gsub(/\s+Version\s+(\d+)/, ' ver \1')

  # " Revision (r)" -> " r"
  cleaned = cleaned.gsub(/\s+Revision\s+\(r\)/, " r")

  # "126 rev 2013" -> "126r2013"
  cleaned = cleaned.gsub(/(\d+)\s+rev\s+(\d{4})/, '\1r\2')

  # Historical spelled-out series: "NBS report ; 8079" -> "NBS RPT 8079"
  cleaned = cleaned.gsub(/\breport\s*;\s*/, "RPT ")
  cleaned = cleaned.gsub(/\breport\b/, "RPT")

  # Detect the format on the same stripped text that was normalized, so
  # surrounding whitespace cannot turn an MR identifier into :short.
  parsed_format = detect_format(source)

  result = new.parse(cleaned)

  case result
  when Hash
    result.merge(parsed_format: parsed_format)
  when Array
    # Merge all hash components into one (e.g. compound_series + edition);
    # non-hash elements are ignored.
    merged = result.each_with_object({}) do |part, combined|
      combined.merge!(part) if part.is_a?(Hash)
    end
    merged.merge(parsed_format: parsed_format)
  else
    result
  end
end
403
+
404
# Classifies an identifier string by its separator style.
#
# Returns :mr (machine-readable) when the text contains a dot and no
# whitespace, e.g. "NIST.SP.800-53", "FIPS.46e1977", "NBS.HB.28pt1e1969".
# Returns :short otherwise, e.g. "NIST SP 800-53".
#
# Leading and trailing whitespace is ignored so that padded input is
# classified the same way as the stripped text the parser works on.
#
# @param input [String] identifier text
# @return [Symbol] :mr or :short
def self.detect_format(input)
  text = input.strip
  dotted = text.include?(".")
  spaced = text.match?(/\s/)

  dotted && !spaced ? :mr : :short
end
420
+
421
# Converts a Roman numeral built from I, V and X into an Arabic number string.
#
# Handles every canonical numeral from "I" (1) to "XXXIX" (39), e.g.
# "IV" -> "4", "X" -> "10", "XII" -> "12".
#
# @param roman [String] Roman numeral text
# @return [String, Object] the Arabic value as a String; the argument itself
#   is returned unchanged when it is not a canonical I/V/X numeral
def self.roman_to_arabic(roman)
  symbol_values = { "I" => 1, "V" => 5, "X" => 10 }
  values = roman.to_s.each_char.map { |char| symbol_values[char] }
  return roman if values.empty? || values.include?(nil)

  # A symbol smaller than its right-hand neighbour is subtracted (IV, IX).
  total = values.each_with_index.sum do |value, index|
    following = values[index + 1]
    following && following > value ? -value : value
  end
  return roman if total > 39

  # Re-encode the total and compare, so malformed spellings such as "IIII"
  # or "VX" fall back to the input instead of yielding a guessed number.
  remaining = total
  canonical = [[10, "X"], [9, "IX"], [5, "V"], [4, "IV"], [1, "I"]].each_with_object(+"") do |(amount, symbol), text|
    while remaining >= amount
      text << symbol
      remaining -= amount
    end
  end

  canonical == roman ? total.to_s : roman
end
438
+
439
# Primitive atoms shared by the grammar rules.

# Single literal characters; :hash_prefix is the "#" used by machine-readable formats.
{ space: " ", dot: ".", dash: "-", slash: "/", hash_prefix: "#" }.each do |rule_name, literal|
  rule(rule_name) { str(literal) }
end

# Single-character classes.
{
  digit: "[0-9]",
  letter: "[A-Za-z]",
  upper_letter: "[A-Z]",
  lower_letter: "[a-z]"
}.each do |rule_name, char_class|
  rule(rule_name) { match(char_class) }
end

# One or more digits.
rule(:digits) { digit.repeat(1) }
452
+
453
# Month names: full English names are tried first, then three-letter
# abbreviations, so a full name is never cut short by its abbreviation.
rule(:month_abbrev) do
  full_names = %w[
    January February March April May June
    July August September October November December
  ]
  short_names = %w[Jan Feb Mar Apr Jun Jul Aug Sep Oct Nov Dec]

  (full_names + short_names).map { |month| str(month) }.reduce(:|)
end
461
+
462
# Language code for translations, captured as :translation.
#
# Accepts an optional leading space or dot (" spa", ".spa"), then one of the
# known codes or any run of 2-4 lowercase letters.
rule(:language_code) do
  # Parslet alternation is an ordered choice: "esp" must be listed before
  # "es", otherwise "es" always wins and the "esp" alternative is unreachable.
  known_codes = %w[esp es pt chi viet port].map { |code| str(code) }.reduce(:|)

  ((space | dot).maybe >> (known_codes | match("[a-z]").repeat(2, 4))).as(:translation)
end
468
+
469
# Stage ID: i (initial), f (final) in either case, or a single digit 1-9
# (numbered iterations).
rule(:stage_id) do
  %w[i I f F 1 2 3 4 5 6 7 8 9].map { |marker| str(marker) }.reduce(:|)
end
475
+
476
# Stage type: pd (public draft), wd (work-in-progress), prd (preliminary),
# each accepted in all-lowercase or all-uppercase form.
rule(:stage_type) do
  %w[pd PD wd WD prd PRD].map { |type| str(type) }.reduce(:|)
end
480
+
481
# Old style stage in parentheses: "(IPD)", "(FPD)", "(2PD)".
# Captured as :stage with nested :stage_id and :stage_type.
rule(:old_stage) do
  stage = (stage_id.as(:stage_id) >> stage_type.as(:stage_type)).as(:stage)

  str("(") >> stage >> str(")")
end
485
+
486
# New style inline stage preceded by a space or a dot: " ipd", ".ipd".
# Captured as :stage with nested :stage_id and :stage_type.
rule(:new_stage) do
  separator = space | dot
  stage = (stage_id.as(:stage_id) >> stage_type.as(:stage_type)).as(:stage)

  separator >> stage
end
490
+
491
# Publisher: "NBS" or "NIST", captured as :publisher.
rule(:publisher) do
  %w[NBS NIST].map { |name| str(name) }.reduce(:|).as(:publisher)
end
495
+
496
# Compound series: series names that include the publisher (or a multi-word
# name), captured as :series.
#
# Alternation is an ordered choice, so any name that starts with another
# name in the list must appear before it (e.g. "NBS CRPL-F-A" before
# "NBS CRPL", "NBS CSM" before "NBS CS").
rule(:compound_series) do
  names = [
    # Longest patterns first to avoid partial matches
    "NBS BRPD-CRPL-D", "NBS CRPL-F-A", "NBS CRPL-F-B",
    "NBS CS-E", "CSRC Building Block", "CSRC Use Case", "CSRC Book",
    "ITL Bulletin", "NSRDS-NBS",
    # NBS and NIST specific patterns that conflict with simple series
    "NIST LCIRC", "NBS LCIRC", "NIST.LCIRC", "NBS.LCIRC", "NBS RPT",
    "NIST PS", "NIST DCI", "NIST Other",
    "NISTPUB",
    "NBS CSM", "NBS CIRC", "NBS.CRPL", "NBS CRPL", "NBS CS",
    "NBS CIS", "NBS HR", "NBS IRPL", "NBS IP", "NBS PS",
    "NBS BH"
  ]

  names.map { |name| str(name) }.reduce(:|).as(:series)
end
513
+
514
# Simple series (no publisher prefix), captured as :series.
#
# Alternation is an ordered choice, so a name that starts with another name
# in the list must come first: "CSWP" and "CSM" are listed before "CS".
# With "CS" ahead of "CSM", the "CSM" alternative could never match.
rule(:simple_series) do
  names = %w[
    AMS VTS
    BSS BMS BH
    FIPS GCR HB MONO
    MP NCSTAR NSRDS IR
    SP TN CSWP
    AI CIRC CSM CS
    CRPL LCIRC OWMWP PC RPT
    SIBS TIBM TTB EAB
    JPCRD JRES
  ]

  names.map { |name| str(name) }.reduce(:|).as(:series)
end
528
+
529
# Suffix letter(s) directly after a number.
#
# Two forms are accepted:
# - "U" followed by one lowercase letter (e.g. "Ur", "Ua")
# - any single letter, optionally followed by digits, unless the text after
#   that letter starts with one of the blocked continuations below
rule(:number_suffix) do
  # Continuations that mark the letter as the start of a keyword or marker
  # rather than a suffix: "ec", "ndex", "nsert", "rrata", "raft" (draft),
  # "pp", "s", "t", "hi", "iet", "ort", "r" (revision marker, e.g. r5),
  # "p" (part marker, e.g. 28p11969).
  blocked_tails = %w[ec ndex nsert rrata raft pp s t hi iet ort r p]
  blocked = blocked_tails.map { |tail| str(tail) }.reduce(:|)

  u_suffix = str("U") >> lower_letter
  letter_suffix = match("[a-zA-Z]") >> blocked.absent? >> digits.maybe

  u_suffix | letter_suffix
end
551
+
552
# Digits with an optional letter suffix. The suffix is taken only when no
# digit follows it (e.g. the "e" in "140e2" is not consumed as a suffix).
rule(:digits_with_suffix) do
  trailing_suffix = number_suffix >> digit.absent?

  digits >> trailing_suffix.maybe
end
557
+
558
# Report number, first part, captured as :first_number.
#
# The alternatives form an ordered choice: the first one that matches wins,
# so more specific patterns are listed before the patterns they start with.
rule(:first_number) do
  # Longest numerals first. With "I" or "V" ahead of "IX", "VI", "VII" and
  # "VIII", the shorter numeral would match first and the longer ones could
  # never be recognised.
  roman_numeral = %w[VIII VII VI IV IX III II I V X].map { |numeral| str(numeral) }.reduce(:|)

  (
    # OWMWP date format: MM-DD-YYYY (e.g., 06-13-2018)
    # Must be first to match before other dash patterns
    (match("[0-9]").repeat(2, 2).as(:owmwp_month) >> dash >>
     match("[0-9]").repeat(2, 2).as(:owmwp_day) >> dash >>
     match("[0-9]").repeat(4, 4).as(:owmwp_year)).as(:owmwp_date_number) |
    # Special text patterns for RPT
    str("ADHOC") | (str("div") >> digits) |
    # Month ranges for RPT: Apr-Jun1948
    (month_abbrev >> dash >> month_abbrev >> digits) |
    # Number with volume suffix (e.g., "539v10" for CIRC, "1011v1")
    (digits.as(:number) >> str("v") >> digits.as(:volume_suffix)).as(:number_with_volume) |
    # Roman numeral patterns: 1011-I-2.0, 1011-II-1.0 (dot part optional)
    (digits >> dash >> roman_numeral >> dash >> digits >> (dot >> digits).maybe) |
    # GB series pattern: 1190GB-1, 1190GB-4A
    (digits >> str("GB") >> dash >> digits >> upper_letter.maybe) |
    # CS series pattern with letter in middle: 102E-42, 123A-50
    (digits >> upper_letter >> dash >> digits) |
    # Volume-number format for CSM series: v6n1, v7n12
    (str("v") >> digits.as(:volume_number) >> str("n") >> digits.as(:issue_number)) |
    # Number with supplement and revision suffix: "154supprev"
    (digits >> str("supprev")) |
    # Number with edition and revision year: "13e2rev1908"
    (digits >> str("e") >> digits >> str("rev") >> digits) |
    # Number with revision year (rv pattern): "1013rv1953"
    (digits.as(:number) >> str("rv") >> digits.as(:revision_year)).as(:number_with_rev_year) |
    # Number with edition, revision and month-date: "13e2revJune1908"
    (digits >> str("e") >> digits >> str("rev") >> month_abbrev >> digits) |
    # Number with eN suffix and supplement (e.g., "101e2supp")
    (digits >> str("e") >> digits >> str("supp") >> digits.maybe) |
    # Edition prefix with revision and date: e2revJune1908
    (str("e") >> digits >> str("rev") >> month_abbrev >> digits) |
    # Edition prefix followed by digits and supplement with optional digits
    (str("e") >> digits >> str("supp") >> digits.maybe) |
    # Number with eN suffix (e.g., "101e2")
    (digits >> str("e") >> digits) |
    # Bare edition ("e2"), not followed by dash+digits
    (str("e") >> digits >> (dash >> digits).absent?) |
    # Letter prefix with digits (e.g., "c4" for CRPL)
    (lower_letter >> digits) |
    # Number with supplement suffix and month/year (e.g., "24suppJan1924")
    (digits >> str("supp") >> month_abbrev >> digits) |
    # Number with supplement suffix (e.g., "144supp"), optional digits
    (digits >> str("supp") >> digits.maybe) |
    # Number with "sup" suffix followed by month/year
    (digits >> str("sup") >> month_abbrev >> digits) |
    # Number with "sup" suffix (e.g., "9350sup")
    (digits >> str("sup")) |
    # Attached language code (e.g., "1088sp"). Listed after the sup/supp
    # patterns so it does not consume them, and accepted only when no
    # letter, digit, dash or dot follows the code.
    (digits.as(:number) >> (str("sp") | str("pt") | str("es") | str("SP") | str("PT") | str("ES")).as(:language_code) >> (upper_letter.absent? >> digit.absent? >> letter.absent? >> dash.absent? >> dot.absent?)) |
    # Part+edition suffix for MR format: "28pt1e1969" (e.g. "NBS.HB.28pt1e1969").
    # The language-code alternative above rejects a following digit, so it
    # does not shadow this one.
    (digits.as(:number) >> str("pt") >> digits.as(:part_number) >> str("e") >> digits.as(:edition_year)) |
    # Parenthetical language code; only the uppercase forms "(SP)", "(PT)",
    # "(ES)" are accepted here
    (digits.as(:number) >> str("(") >> (str("SP") | str("PT") | str("ES")).as(:language_code) >> str(")")) |
    # Regular number with optional suffix - includes letters like "A"
    digits_with_suffix
  ).as(:first_number)
end
629
+
630
+ # Second number (after dash) - allow pt suffix, letter suffixes, and CRPL patterns
631
rule(:second_number) do
  # Number component that follows the first dash of a report number.
  # Rejected up front when the text starts with a month abbreviation
  # (so "-Feb1985" is left for the edition handling) or with "draft".
  ascii_letter = match("[a-zA-Z]")
  bare_number = digits.as(:number_only)

  # Ordered choice: the first alternative that matches wins, so the more
  # specific shapes must stay ahead of the general ones.
  candidates = [
    # Revision with U+letter suffix ("22r1Ua"); ahead of the plain letter
    # suffix so that "Ua" is not cut down to "U".
    digits >> str("r") >> digits >> str("U") >> lower_letter,
    # Revision with a single letter suffix ("22r1a", "22r1A"); ahead of the
    # plain r+digits form so the letter stays attached.
    digits >> str("r") >> digits >> ascii_letter,
    # Revision attached to the number ("126r2013").
    bare_number >> str("r") >> digits.as(:edition_id),
    # CRPL range with underscore ("2_3-1A").
    digits >> str("_") >> digits >> dash >> digits >> upper_letter.maybe,
    # Letter, dash, digits ("m-5").
    lower_letter >> dash >> digits,
    # Number with pt suffix ("57pt1"), refused when a dash follows
    # (the "pt3-1" shapes are CRPL part components).
    digits >> str("pt") >> digits >> dash.absent?,
    # Number with one uppercase letter ("56A", "123B").
    digits >> upper_letter,
    # Number + r + letter ("27ra").
    (bare_number >> str("r") >> ascii_letter.as(:letter)).as(:revision_letter),
    # r + letter without leading digits ("rA").
    (str("r") >> ascii_letter).as(:revision_letter_suffix),
    # r + digits ("r1", "r2") as a trailing revision.
    (str("r") >> digits.as(:edition_id)).as(:revision_simple),
    # Fixed keywords.
    str("NCNR"),
    str("PERMIS"),
    str("BFRL"),
    # One to three capital letters on their own ("A", "B", "C").
    upper_letter.repeat(1, 3),
    # Number with optional suffix, unless it starts a FIPS date
    # (dash, month, digits, slash).
    digits_with_suffix >> (dash >> month_abbrev >> digits >> slash).absent?,
    # Single lowercase letter not followed by a digit ("a", but not "r2").
    lower_letter >> digit.absent?
  ]

  month_abbrev.absent? >>
    str("draft").absent? >>
    candidates.reduce(:|).as(:second_number)
end
676
+
677
+ # Edition component per NIST spec: <edition-type><edition-id>
678
+ # Type: "e" (edition), "r" (revision), "rev" (revision verbose), "-" (historical)
679
+ # ID: number (1-9) or year (yyyy)
680
+ # Examples: e2, e2021, r5, rev2013, rev 2013, -3
681
+ # Enhanced: Support space-separated format from preprocessing (r1 separated from number)
682
rule(:edition) do
  # Edition component: a type marker ("e", "r", "rev" or "-") followed by
  # an identifier. Each alternative is tagged with its own key so the
  # original spelling can be told apart downstream.
  id = digits.as(:edition_id)
  trailing_letter = match("[a-zA-Z]").as(:edition_letter)
  r = str("r")
  four_digits = match("[0-9]").repeat(4, 4)

  [
    # "e2", "e2021", optionally preceded by a space.
    (space.maybe >> str("e") >> id).as(:edition_e),
    # Space, then "r", digits and one letter (" r5A").
    (space >> r >> id >> trailing_letter).as(:edition_r_with_space_letter),
    # Space, then "r" and digits (" r5").
    (space >> r >> id).as(:edition_r_with_space),
    # Compact "r5A".
    (r >> id >> trailing_letter).as(:edition_r_no_space_letter),
    # Compact "r5".
    (r >> id).as(:edition_r_no_space),
    # Verbose "rev2013" / "rev 2013", optionally preceded by a space.
    (space.maybe >> str("rev") >> space.maybe >> id).as(:edition_rev),
    # Historical "-2", "-3": exactly one digit 1-9 with no digit after it,
    # so dates such as "-1908" are not consumed here.
    (dash >> match("[1-9]").as(:edition_id) >> digit.absent?).as(:edition_historical),
    # Dash followed by four digits ("-1979", "-1990").
    (dash >> four_digits.as(:dash_year)).as(:edition_dash_year)
  ].reduce(:|)
end
704
+
705
+ # Date component per NIST spec: -{YYYY} or -{YYYYMM} or -{YYYYMMDD}
706
+ # Separate from Edition - both can coexist
707
+ # Examples: -1908, -190806, -19770930
708
rule(:date) do
  # Date component introduced by a dash. Numeric forms are tried from the
  # longest (YYYYMMDD) to the shortest (YYYY); the month-name form is last.
  year = match("[0-9]").repeat(4, 4).as(:date_year)
  two_digits = match("[0-9]").repeat(2, 2)

  (
    # "-19770930"
    (dash >> year >> two_digits.as(:date_month) >> two_digits.as(:date_day)) |
    # "-190806"
    (dash >> year >> two_digits.as(:date_month)) |
    # "-1908"
    (dash >> year) |
    # "-June1908", "-Jan1925"
    (dash >> month_abbrev.as(:date_month) >> digits.as(:date_year))
  ).as(:date)
end
723
+
724
+ # LEGACY EDITION PATTERNS (for backward compatibility during migration)
725
+ # These will be gradually replaced as we migrate to proper Edition/Date components
726
rule(:legacy_edition) do
  # Older edition spellings kept for backward compatibility.
  month_name = match("[A-Za-z]").repeat(3, 9).as(:edition_month)
  year = digits.as(:edition_year)
  rev = str("rev")

  [
    # "r1a", "r2b" (also " R…"): one or two digits plus a lowercase letter.
    (str("r") | str(" R")) >> match("[0-9]").repeat(1, 2).as(:edition) >> lower_letter.as(:edition_letter),
    # "rev2013", "rev 2013".
    rev >> space.maybe >> year,
    # "e2revJune1908" (also " E…"): edition number, "rev", month name, year.
    (str("e") | str(" E")) >> match("[0-9]").repeat(1, 3).as(:edition) >> rev >> month_name >> year,
    # "e2018" or "e201801": four-digit year with an optional two-digit month.
    str("e") >> match("[0-9]").repeat(4, 4).as(:edition_year) >> match("[0-9]").repeat(2, 2).as(:edition_month).maybe,
    # "revJune1908", "revJan1925".
    rev >> month_name >> year
  ].reduce(:|)
end
745
+
746
+ # CRPL range pattern (e.g., 1-2_3-1, 1-2_3-1A with suffix) - matches after first dash
747
rule(:crpl_range) do
  # digits "_" digits "-" digits, with an optional uppercase letter at the
  # end (e.g. "2_3-1", "2_3-1A").
  range_body = digits >> str("_") >> digits >> dash >> digits >> upper_letter.maybe
  range_body.as(:crpl_range)
end
750
+
751
+ # Full report number - support dot-separated parts AND CRPL ranges
752
+ # ENHANCED: Support multiple dashes for GCR patterns (Session 220)
753
+ # FIXED: Put GCR pattern first to prioritize matching full dash-separated patterns
754
+ # FIXED: Add edition.maybe to support revision patterns like 800-53r5 in short format
755
+ # FIXED: Month abbreviation as edition (e.g., 107-Mar1985, 11-Jan1925)
756
+ # FIXED: FIPS date format with day and slash (e.g., 11-1-Sep30/1977)
757
rule(:report_number) do
  # A first number followed by at most one continuation. The continuations
  # form an ordered choice, so their relative order is significant.
  month = month_abbrev.as(:edition_month)
  year = digits.as(:edition_year)
  four_digit_year = match("[0-9]").repeat(4, 4).as(:dash_year)

  # "U" + lowercase letter ("Ur", "Ua") or a single uppercase letter.
  letter_suffix = ((str("U") >> lower_letter.as(:letter_suffix_extra)) | upper_letter).as(:letter_suffix)

  continuations = [
    # "-Mar1985", "-Jan1925": month + year. Kept first so these are not
    # picked up by a later alternative.
    dash >> month >> year,
    # FIPS date "-1-Sep30/1977": part, month, day, slash, year.
    dash >> digits.as(:fips_part) >> dash >> month >> digits.as(:edition_day) >> slash >> year,
    # Decimal suffix "-2073.3".
    (dash >> digits.as(:decimal_base) >> dot >> digits.as(:decimal_suffix)).as(:decimal_number),
    # Letter suffix "-1A", "-197Ur".
    (dash >> digits.as(:letter_base) >> letter_suffix).as(:letter_number),
    # "-1979": exactly four digits with none of the listed components after.
    (dash >> four_digit_year >>
      (space | dot | part | crpl_range | second_number | dash).absent?).as(:edition_dash_year),
    # "-1-1990": second number then a four-digit year. Ahead of the
    # multi-dash form below because both begin with dash + second_number + dash.
    (dash >> second_number >> dash >> four_digit_year >>
      (space | dot | part | crpl_range | revision | draft).absent?).as(:second_number_edition_year),
    # "-1-Sep1977": second number then month + year.
    (dash >> second_number >> dash >> month >> year >>
      (space | dot | part | crpl_range | edition | revision | draft).absent?).as(:fips_month_year_after_part),
    # Multi-dash form "-3273-37", "-200-30B".
    dash >> second_number >> dash >> (digits >> upper_letter.maybe).as(:part_number),
    # Dot-separated second number ("984.4").
    dot >> second_number,
    # Dash + CRPL range or second number, optional edition ("800-53r5").
    dash >> (crpl_range | second_number) >> edition.maybe,
    # Dash directly followed by an edition.
    dash >> edition
    # TODO: attached ("1088sp") and parenthetical ("378(sp)") language-code
    # suffixes are not handled as continuations here.
  ]

  first_number >> continuations.reduce(:|).maybe
end
804
+
805
+ # Volume
806
rule(:volume) do
  # Volume component: a marker ("v", optionally preceded by a space, or the
  # verbose " Vol. ") followed by digits, an optional letter range
  # ("a-l", "m-z", "A-L", "M-Z") and up to two uppercase letters
  # (e.g. "v3B", "v1A"). Everything after the marker is captured as :volume.
  #
  # The verbose marker is tried FIRST. In the previous form,
  # `space.maybe >> (str("v") | str(" Vol. "))`, the optional space consumed
  # the blank that " Vol. " itself starts with, and a PEG sequence never
  # gives it back, so " Vol. 3" could not match.
  # NOTE(review): this assumes `space` matches a single blank; its
  # definition is outside this section, so please confirm.
  # The old nested form is kept as the second alternative so that every
  # input accepted before is still accepted.
  marker = str(" Vol. ") | (space.maybe >> (str("v") | str(" Vol. ")))

  marker >>
    (digits >>
      (str("a-l") | str("m-z") | str("A-L") | str("M-Z")).maybe >>
      upper_letter.repeat(0, 2)).as(:volume)
end
814
+
815
+ # Part - enhanced to support patterns like p1adde1 AND pt3r1 (part with revision)
816
rule(:part) do
  # Part component: a marker followed by digits and optional extras, all
  # captured as :part.
  #
  # Markers: the verbose " Part ", or "pt" / "p" / "P" optionally preceded
  # by a space.
  #
  # " Part " is tried FIRST. With the short markers first, the input
  # " Part 1" was accepted as space + "P"; the following `digits` then
  # failed on "art 1", and because a PEG does not re-enter a choice that has
  # already succeeded, the " Part " alternative could never be reached.
  # No previously accepted input changes, since nothing that starts with
  # " Part " could match before.
  marker = str(" Part ") | (space.maybe >> (str("pt") | str("p") | str("P")))

  marker >>
    (digits >>
      # Revision after the part number: "pt3r1", "p1r1", "p1 r1".
      (space.maybe >> str("r") >> digits).maybe >>
      # Addendum with optional edition: "add", "adde1".
      (str("add") >> (str("e") >> digits).maybe).maybe >>
      # Dash-separated sub-number: "-1".
      (dash >> digits).maybe).as(:part)
end
825
+
826
+ # Revision
827
rule(:revision) do
  # Revision component. Alternatives are an ordered choice: the first one
  # that matches wins, so longer/more specific forms come first.
  #
  # In the first two alternatives the marker is `rev | r`, longest first.
  # Written as `r | rev`, the "r" branch always succeeded on "rev…" input,
  # the rest of the sequence then failed on "ev…", and the PEG never came
  # back to try "rev". "revJun1992" and "rev6/1925" therefore fell through
  # to later, looser alternatives.

  # Month and year: "rJun1992", "r Jun1992", "revJun1992".
  (space.maybe >> (str("rev") | str("r")) >> space.maybe >> month_abbrev.as(:revision_month) >> digits.as(:revision_year)) |
    # Number, slash, year: "r6/1925", "r11/1924", "rev6/1925".
    (space.maybe >> (str("rev") | str("r")) >> digits.as(:revision) >>
      slash >> digits.as(:revision_year)) |
    # Four-digit year: "r1995", "r 1995", " r1995".
    ((str(" r") | str("r")) >> space.maybe >> match("[0-9]").repeat(4, 4).as(:revision_year)) |
    # "rev2013", "rev 2013".
    (str("rev") >> space.maybe >> digits.as(:revision_year)) |
    # Digits and/or letters ("r1a", "r1A", "ra", "r1"). The prefix is
    # captured as written (e.g. " Rev. ") in :revision_prefix.
    ((str(" rev ") | str("rev") | str(" r") | str("r") | str(" Rev. ") | str(" Revision (r)")).as(:revision_prefix) >>
      ((digits >> match("[a-zA-Z]").maybe) | match("[a-zA-Z]").repeat(1)).as(:revision_id)).as(:revision) |
    # Bare " r" at the very end of the input; kept last so it cannot take
    # input that belongs to the forms above.
    (str(" r") >> any.absent?).as(:revision_standalone)
end
849
+
850
+ # Version - V1 SP PARSER COMPATIBLE
851
+ # Supports: ver1.0.2, ver2, " Ver. 2.0", " Version 1.0", v1.0.2, -v1.0, .ver1.0 (MR format)
852
rule(:version) do
  # Version component, captured as :version.
  # major.minor with an optional third segment ("1.0", "1.0.2").
  dotted = digits >> dot >> digits >> (dot >> digits).maybe
  # Any number of dot-separated segments, including none ("2", "1.0.2").
  free_form = digits >> (dot >> digits).repeat

  # "ver1.0.2", "ver2", ".ver1.0", " ver 1.0": optional space or dot before,
  # optional space after "ver".
  ((space | dot).maybe >> str("ver") >> space.maybe >> free_form.as(:version)) |
    # " Ver. 2.0", " Version 1.0": dotted number required.
    ((str(" Ver. ") | str(" Version ")) >> dotted.as(:version)) |
    # "v1.0", "-v1.0", " v1.0.2": dotted number required.
    ((dash | space).maybe >> str("v") >> dotted.as(:version))
end
862
+
863
+ # Update - V1 COMPATIBLE
864
+ # Format: /Upd{N}-{YYYY}{MM} where MM is optional
865
+ # Examples: /Upd1-2015, /Upd3-202102, -upd, /upd (after preprocessing)
866
+ # Update number is optional (e.g., "500-300-upd" has no number)
867
+ # Captures prefix to preserve original format (-upd vs /Upd)
868
rule(:update) do
  # Update component: marker, optional update number, optional "-YYYY[MM]".
  # The marker text is captured in :update_prefix as written ("/Upd",
  # "/upd" or "-upd").
  year = match("[0-9]").repeat(4, 4).as(:update_year)
  month = match("[0-9]").repeat(2, 2).as(:update_month)

  lowercase_marker = space.maybe >> (str("/upd") | str("-upd"))
  marker = (str("/Upd") | lowercase_marker).as(:update_prefix)

  # Both the number and the dated part may be missing ("500-300-upd").
  details = digits.as(:update_number).maybe >> (dash >> year >> month.maybe).maybe

  marker >> details.as(:update)
end
883
+
884
+ # Addendum
885
rule(:addendum) do
  # "-add", ".add" or " Add.", an optional space or dash, then an addendum
  # number that may be empty.
  marker = str("-add") | str(".add") | str(" Add.")
  number = (digits | str("")).as(:addendum_number)

  (marker >> (space | dash).maybe >> number).as(:addendum)
end
889
+
890
+ # Supplement - enhanced to support date patterns, year patterns, and combined with revision
891
+ # Examples: suppJan1924, supp3/1926, supp1925, supJun1925-Jun1927 (date ranges), supprev
892
rule(:supplement) do
  # "supp" or "sup" (optionally preceded by a space) followed by optional
  # detail. Detail alternatives are tried in the order listed.
  detail = [
    # "supprev"
    str("rev").as(:supplement_with_rev),
    # Date range "Jan1924-Jan1926"
    (month_abbrev.as(:supp_month_start) >> digits.as(:supp_year_start) >>
      dash >> month_abbrev.as(:supp_month_end) >> digits.as(:supp_year_end)).as(:supplement_date_range),
    # Month and year "Jan1924"
    (month_abbrev.as(:supp_month) >> digits.as(:supp_year)).as(:supplement_date),
    # Number, slash, year "3/1926"
    (digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_slash_year),
    # Digits alone, captured as the year ("1925")
    digits.as(:supp_year),
    # Any other run of letters and digits
    match("[A-Za-z0-9]").repeat(1).as(:supplement_suffix)
  ]

  space.maybe >>
    (str("supp") | str("sup")) >>
    detail.reduce(:|).maybe
end
911
+
912
+ # Errata
913
rule(:errata) do
  # "errata" or "err", optionally preceded by a dash. The long form is
  # listed first so "errata" is not cut down to "err".
  keyword = str("errata") | str("err")
  (dash.maybe >> keyword).as(:errata)
end
916
+
917
+ # Index
918
rule(:index) do
  # The keyword "index" or its short spelling "indx".
  keyword = str("index") | str("indx")
  keyword.as(:index)
end
921
+
922
+ # Insert
923
rule(:insert) do
  # The keyword "insert" or its short spelling "ins" (long form first).
  keyword = str("insert") | str("ins")
  keyword.as(:insert)
end
926
+
927
+ # Appendix
928
rule(:appendix) do
  # The literal "app", captured as :appendix.
  marker = str("app")
  marker.as(:appendix)
end
931
+
932
+ # Section - make digits optional for patterns like just "sec"
933
rule(:section) do
  # "sec" with an optional number; a bare "sec" is accepted too.
  number = digits.as(:section)
  str("sec") >> number.maybe
end
936
+
937
+ # Translation (3-letter language code) - V1 COMPATIBLE
938
+ # Supports: (spa), " spa", ".spa" (MR format)
939
rule(:translation) do
  # Exactly three word characters, captured as :translation.
  code = match('\w').repeat(3, 3).as(:translation)

  # "(spa)" | " spa" | ".spa"
  (str("(") >> code >> str(")")) |
    (space >> code) |
    (dot >> code)
end
947
+
948
+ # Public draft suffix - for patterns like 2pd, 3pd
949
rule(:pd_suffix) do
  # Space, digits, then "pd" (e.g. " 2pd", " 3pd"), captured as
  # :public_draft.
  suffix = space >> digits >> str("pd")
  suffix.as(:public_draft)
end
952
+
953
+ # Draft stage - enhanced to support suffix pattern and number after draft
954
+ # ENHANCED: Accept optional space before dash to match after report_number
955
rule(:draft) do
  # Draft marker in one of three spellings, captured as :draft.
  # Optional draft number, with or without a space before it.
  draft_number = ((space >> digits) | digits).maybe

  # " (Draft)"
  parenthetical = space >> str("(Draft)")
  # "-draft", " -draft 2", "-draft2"
  dashed = space.maybe >> dash >> str("draft") >> draft_number

  (parenthetical | dashed | pd_suffix).as(:draft)
end
960
+
961
+ # Special date format with slash for FIPS (part of number, not edition)
962
rule(:fips_date) do
  # "-<part>-<month><day>/<year>", e.g. "-1-Sep30/1977".
  [
    dash, digits.as(:fips_part),
    dash, month_abbrev.as(:fips_month), digits.as(:fips_day),
    slash, digits.as(:fips_year)
  ].reduce(:>>)
end
966
+
967
+ # All possible parts (order matters!)
968
rule(:parts) do
  # Any single trailing component. This is an ordered choice, so the
  # position of each rule in the list decides which one wins.
  [
    # new_stage stays ahead of language_code so "ipd" is not read as a
    # language code.
    new_stage,
    section, index, insert, appendix, pd_suffix,
    edition, date, legacy_edition, revision,
    # version is ahead of volume so dotted versions ("v1.1") are tried
    # before simple volumes ("v1").
    version,
    volume, part, update, addendum,
    supplement, errata, language_code
  ].reduce(:|)
end
978
+
979
+ # CIRC supplement identifiers (handled by the circ_supplement_identifier rule) - split into base + supplement
980
+ # Examples:
981
+ # - "NBS CIRC 101e2supp" → base="NBS CIRC 101e2", supplement
982
+ # - "NBS CIRC 25supp-1924" → base="NBS CIRC 25", supplement_year="1924"
983
+ # - "NBS CIRC 24suppJan1924" → base="NBS CIRC 24", supplement_edition="Jan1924"
984
+ # - "NBS CIRC suppJun1925-Jun1926" → date range supplement (no base)
985
+ # - "NBS LCIRC 378Gsup" → base="NBS LCIRC 378G", supplement (no metadata)
986
+ # - "NBS.LCIRC.378sup1/1927" → dot-separated MR format (after preprocessing)
987
+ # Dot-separated machine-readable format: NIST.SP.800-116 or #NIST.2024-01-15.123
988
+ # Enhanced to support parts after number like NIST.SP.1011-I-2.0
989
+ # Enhanced to support revision+update patterns like NIST.IR.8115r1-upd
990
rule(:mr_identifier) do
  # Dot-separated machine-readable identifier, e.g. "NIST.SP.800-116",
  # optionally introduced by the hash prefix.
  # "_2009" style edition year.
  underscore_edition = (str("_") >> digits.as(:edition_year)).maybe
  # Up to three extra ".<digits>" / ".<uppercase letter>" segments.
  dotted_extras = (dot >> (digits | upper_letter)).repeat(0, 3)

  [
    hash_prefix.maybe,
    publisher, dot,
    simple_series, dot,
    report_number,
    underscore_edition,
    # Letter suffix ahead of an update ("8286C-upd1").
    upper_letter.maybe,
    # Revision ahead of an update ("8115r1-upd").
    edition.maybe,
    update.maybe,
    dotted_extras,
    # Remaining components, then an optional draft marker.
    parts.repeat,
    draft.maybe
  ].reduce(:>>)
end
1008
+
1009
+ # Main identifier structure
1010
+ # Try compound series first (longest match), then publisher + simple series
1011
rule(:identifier) do
  # Top-level identifier. Tried in order: CIRC supplement form,
  # machine-readable form, compound series, publisher + simple series,
  # simple series alone.
  separator = space | dot

  # Components shared by the three human-readable forms, in match order.
  trailing = [
    report_number.maybe, fips_date.maybe, parts.repeat,
    draft.maybe, translation.maybe, new_stage.maybe
  ]

  # Compound series (series name already includes the publisher), a
  # separator, then an optional old-style stage.
  compound_form = ([compound_series, separator, old_stage.maybe] + trailing).reduce(:>>)

  # Publisher, separator, simple series, optional old-style stage, separator.
  publisher_form = ([publisher, separator, simple_series, old_stage.maybe, separator] + trailing).reduce(:>>)

  # Simple series with no publisher, optional old-style stage, separator.
  series_only_form = ([simple_series, old_stage.maybe, separator] + trailing).reduce(:>>)

  circ_supplement_identifier |
    mr_identifier |
    compound_form |
    publisher_form |
    series_only_form
end
1036
+
1037
+ # CIRC Supplement identifier - split into base + supplement
1038
+ # Must be complete rule with all patterns
1039
rule(:circ_supplement_identifier) do
  # NBS CIRC / LCIRC supplement identifiers, split into a series, a base
  # portion and a supplement.
  series_name = str("NBS CIRC") | str("NBS LCIRC") | str("NBS.CIRC") | str("NBS.LCIRC")
  series = (series_name.as(:series) >> (space | dot)).as(:circ_series)

  # Supplement given only as a date range, with no base number:
  # "suppJun1925-Jun1926".
  date_range = (
    str("supp") >> month_abbrev.as(:supp_month_start) >> digits.as(:supp_year_start) >>
    dash >> month_abbrev.as(:supp_month_end) >> digits.as(:supp_year_end)
  ).as(:supplement_date_range)

  # Everything ahead of the supplement marker.
  base = (
    # Number with edition: "101e2"
    (digits.as(:base_number) >> str("e") >> digits.as(:edition_number)) |
    # Number, lowercase letter, number: "145r11"
    (digits.as(:base_number) >> lower_letter.as(:revision_letter) >> digits.as(:revision_number)) |
    # Number with uppercase letter: "378G"
    (digits.as(:base_number) >> upper_letter.as(:letter_suffix)) |
    # Plain number: "25", "24"
    digits.as(:simple_number)
  ).as(:base_portion)

  # Optional detail after an explicit "supp" / "sup".
  metadata = (
    # "Jan1924"
    (month_abbrev >> digits).as(:supplement_month_year) |
    # "-12/1926"
    (dash >> digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_dash_slash_year) |
    # "-1924"
    (dash >> digits.as(:supplement_year)) |
    # "1/1927"
    (digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_slash_year) |
    # Nothing after the marker
    str("").as(:supplement_empty)
  )

  explicit_supplement = (str("supp") | str("sup")) >> metadata.maybe
  # Supplement implied by "/<year>" straight after the base: "145r11/1925".
  implicit_supplement = (slash >> digits.as(:implicit_supplement_year)).as(:implicit_supplement)

  series >> (date_range | (base >> (explicit_supplement | implicit_supplement)))
end
1080
+
1081
# Parsing starts at the :identifier rule.
root :identifier
1082
+ end
1083
+ end
1084
+ end