pubid 1.15.19 → 2.0.0.pre.alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +1 -1
- data/README.adoc +2041 -53
- data/archived-gems/pubid-ccsds/update_codes.yaml +1 -0
- data/archived-gems/pubid-iec/stages.yaml +129 -0
- data/archived-gems/pubid-iec/update_codes.yaml +67 -0
- data/archived-gems/pubid-ieee/update_codes.yaml +104 -0
- data/archived-gems/pubid-iso/stages.yaml +106 -0
- data/archived-gems/pubid-iso/update_codes.yaml +4 -0
- data/archived-gems/pubid-itu/i18n.yaml +13 -0
- data/archived-gems/pubid-itu/series.yaml +42 -0
- data/archived-gems/pubid-nist/publishers.yaml +6 -0
- data/archived-gems/pubid-nist/series.yaml +121 -0
- data/archived-gems/pubid-nist/stages.yaml +16 -0
- data/archived-gems/pubid-nist/update_codes.yaml +93 -0
- data/archived-gems/pubid-plateau/update_codes.yaml +6 -0
- data/data/ccsds/update_codes.yaml +1 -0
- data/data/iec/update_codes.yaml +67 -0
- data/data/ieee/update_codes.yaml +104 -0
- data/data/iso/update_codes.yaml +21 -0
- data/data/nist/update_codes.yaml +89 -0
- data/data/plateau/update_codes.yaml +6 -0
- data/lib/pubid/amca/builder.rb +176 -0
- data/lib/pubid/amca/identifier.rb +57 -0
- data/lib/pubid/amca/identifiers/base.rb +64 -0
- data/lib/pubid/amca/identifiers/interpretation.rb +51 -0
- data/lib/pubid/amca/identifiers/publication.rb +47 -0
- data/lib/pubid/amca/identifiers/standard.rb +22 -0
- data/lib/pubid/amca/identifiers.rb +12 -0
- data/lib/pubid/amca/parser.rb +153 -0
- data/lib/pubid/amca/scheme.rb +16 -0
- data/lib/pubid/amca/single_identifier.rb +33 -0
- data/lib/pubid/amca/urn_generator.rb +50 -0
- data/lib/pubid/amca.rb +26 -0
- data/lib/pubid/ansi/builder.rb +52 -0
- data/lib/pubid/ansi/identifier.rb +55 -0
- data/lib/pubid/ansi/identifiers/american_national_standard.rb +12 -0
- data/lib/pubid/ansi/identifiers/standard.rb +16 -0
- data/lib/pubid/ansi/identifiers.rb +11 -0
- data/lib/pubid/ansi/parser.rb +91 -0
- data/lib/pubid/ansi/scheme.rb +15 -0
- data/lib/pubid/ansi/single_identifier.rb +45 -0
- data/lib/pubid/ansi/urn_generator.rb +76 -0
- data/lib/pubid/ansi.rb +27 -0
- data/lib/pubid/api/builder.rb +85 -0
- data/lib/pubid/api/components/code.rb +9 -0
- data/lib/pubid/api/identifier.rb +68 -0
- data/lib/pubid/api/identifiers/base.rb +24 -0
- data/lib/pubid/api/identifiers/bulletin.rb +15 -0
- data/lib/pubid/api/identifiers/continuous_operations_standard.rb +15 -0
- data/lib/pubid/api/identifiers/mpms.rb +44 -0
- data/lib/pubid/api/identifiers/publication.rb +15 -0
- data/lib/pubid/api/identifiers/recommended_practice.rb +15 -0
- data/lib/pubid/api/identifiers/specification.rb +15 -0
- data/lib/pubid/api/identifiers/standard.rb +15 -0
- data/lib/pubid/api/identifiers/technical_report.rb +15 -0
- data/lib/pubid/api/identifiers/typeless_standard.rb +27 -0
- data/lib/pubid/api/parser.rb +140 -0
- data/lib/pubid/api/scheme.rb +66 -0
- data/lib/pubid/api/single_identifier.rb +46 -0
- data/lib/pubid/api/urn_generator.rb +41 -0
- data/lib/pubid/api.rb +17 -0
- data/lib/pubid/ashrae/builder.rb +498 -0
- data/lib/pubid/ashrae/identifier.rb +57 -0
- data/lib/pubid/ashrae/identifiers/addenda_package.rb +46 -0
- data/lib/pubid/ashrae/identifiers/addendum.rb +55 -0
- data/lib/pubid/ashrae/identifiers/base.rb +23 -0
- data/lib/pubid/ashrae/identifiers/combined_addenda.rb +51 -0
- data/lib/pubid/ashrae/identifiers/errata.rb +40 -0
- data/lib/pubid/ashrae/identifiers/guideline.rb +38 -0
- data/lib/pubid/ashrae/identifiers/interpretation.rb +39 -0
- data/lib/pubid/ashrae/identifiers/standard.rb +38 -0
- data/lib/pubid/ashrae/identifiers.rb +16 -0
- data/lib/pubid/ashrae/parser.rb +724 -0
- data/lib/pubid/ashrae/scheme.rb +53 -0
- data/lib/pubid/ashrae/single_identifier.rb +23 -0
- data/lib/pubid/ashrae/supplement_identifier.rb +23 -0
- data/lib/pubid/ashrae/urn_generator.rb +59 -0
- data/lib/pubid/ashrae.rb +21 -0
- data/lib/pubid/asme/builder.rb +153 -0
- data/lib/pubid/asme/components/code.rb +18 -0
- data/lib/pubid/asme/identifier.rb +61 -0
- data/lib/pubid/asme/identifiers/base.rb +70 -0
- data/lib/pubid/asme/identifiers/standard.rb +12 -0
- data/lib/pubid/asme/identifiers.rb +10 -0
- data/lib/pubid/asme/parser.rb +308 -0
- data/lib/pubid/asme/scheme.rb +37 -0
- data/lib/pubid/asme/single_identifier.rb +29 -0
- data/lib/pubid/asme/urn_generator.rb +133 -0
- data/lib/pubid/asme.rb +21 -0
- data/lib/pubid/astm/builder.rb +159 -0
- data/lib/pubid/astm/components/code.rb +33 -0
- data/lib/pubid/astm/identifier.rb +92 -0
- data/lib/pubid/astm/identifiers/adjunct.rb +21 -0
- data/lib/pubid/astm/identifiers/base.rb +13 -0
- data/lib/pubid/astm/identifiers/data_series.rb +25 -0
- data/lib/pubid/astm/identifiers/iso_dual_published.rb +74 -0
- data/lib/pubid/astm/identifiers/manual.rb +40 -0
- data/lib/pubid/astm/identifiers/monograph.rb +25 -0
- data/lib/pubid/astm/identifiers/research_report.rb +18 -0
- data/lib/pubid/astm/identifiers/standard.rb +52 -0
- data/lib/pubid/astm/identifiers/technical_report.rb +23 -0
- data/lib/pubid/astm/identifiers/work_in_progress.rb +21 -0
- data/lib/pubid/astm/parser.rb +244 -0
- data/lib/pubid/astm/scheme.rb +55 -0
- data/lib/pubid/astm/single_identifier.rb +25 -0
- data/lib/pubid/astm/urn_generator.rb +99 -0
- data/lib/pubid/astm.rb +38 -0
- data/lib/pubid/bsi/builder.rb +1483 -0
- data/lib/pubid/bsi/components/code.rb +11 -0
- data/lib/pubid/bsi/components/date.rb +11 -0
- data/lib/pubid/bsi/components/publisher.rb +11 -0
- data/lib/pubid/bsi/components/type.rb +11 -0
- data/lib/pubid/bsi/identifier.rb +87 -0
- data/lib/pubid/bsi/identifiers/addendum_document.rb +64 -0
- data/lib/pubid/bsi/identifiers/adopted_european_norm.rb +95 -0
- data/lib/pubid/bsi/identifiers/adopted_international_standard.rb +82 -0
- data/lib/pubid/bsi/identifiers/aerospace_standard.rb +118 -0
- data/lib/pubid/bsi/identifiers/amendment.rb +40 -0
- data/lib/pubid/bsi/identifiers/base.rb +11 -0
- data/lib/pubid/bsi/identifiers/british_industrial_practice.rb +27 -0
- data/lib/pubid/bsi/identifiers/british_standard.rb +33 -0
- data/lib/pubid/bsi/identifiers/bundled_identifier.rb +114 -0
- data/lib/pubid/bsi/identifiers/committee_document.rb +51 -0
- data/lib/pubid/bsi/identifiers/consolidated_identifier.rb +152 -0
- data/lib/pubid/bsi/identifiers/corrigendum.rb +28 -0
- data/lib/pubid/bsi/identifiers/detailed_specification.rb +69 -0
- data/lib/pubid/bsi/identifiers/disc.rb +56 -0
- data/lib/pubid/bsi/identifiers/draft_document.rb +71 -0
- data/lib/pubid/bsi/identifiers/electronic_book.rb +52 -0
- data/lib/pubid/bsi/identifiers/expert_commentary.rb +47 -0
- data/lib/pubid/bsi/identifiers/explanatory_supplement.rb +82 -0
- data/lib/pubid/bsi/identifiers/flex.rb +61 -0
- data/lib/pubid/bsi/identifiers/handbook.rb +39 -0
- data/lib/pubid/bsi/identifiers/index.rb +62 -0
- data/lib/pubid/bsi/identifiers/method.rb +76 -0
- data/lib/pubid/bsi/identifiers/national_annex.rb +73 -0
- data/lib/pubid/bsi/identifiers/practice_guide.rb +27 -0
- data/lib/pubid/bsi/identifiers/publicly_available_specification.rb +79 -0
- data/lib/pubid/bsi/identifiers/published_document.rb +79 -0
- data/lib/pubid/bsi/identifiers/section.rb +62 -0
- data/lib/pubid/bsi/identifiers/set.rb +46 -0
- data/lib/pubid/bsi/identifiers/standalone_amendment.rb +40 -0
- data/lib/pubid/bsi/identifiers/supplement_document.rb +51 -0
- data/lib/pubid/bsi/identifiers/supplementary_index.rb +81 -0
- data/lib/pubid/bsi/identifiers/technical_specification.rb +79 -0
- data/lib/pubid/bsi/identifiers/test_method.rb +67 -0
- data/lib/pubid/bsi/identifiers/value_added_publication.rb +52 -0
- data/lib/pubid/bsi/identifiers.rb +52 -0
- data/lib/pubid/bsi/model.rb +196 -0
- data/lib/pubid/bsi/parser.rb +659 -0
- data/lib/pubid/bsi/scheme.rb +243 -0
- data/lib/pubid/bsi/single_identifier.rb +129 -0
- data/lib/pubid/bsi/urn_generator.rb +84 -0
- data/lib/pubid/bsi.rb +32 -0
- data/lib/pubid/builder/base.rb +138 -0
- data/lib/pubid/bundled_identifier.rb +126 -0
- data/lib/pubid/ccsds/builder.rb +56 -0
- data/lib/pubid/ccsds/identifier.rb +84 -0
- data/lib/pubid/ccsds/identifiers/base.rb +89 -0
- data/lib/pubid/ccsds/identifiers/base_BASE_88929.rb +70 -0
- data/lib/pubid/ccsds/identifiers/corrigendum.rb +39 -0
- data/lib/pubid/ccsds/identifiers.rb +10 -0
- data/lib/pubid/ccsds/parser.rb +71 -0
- data/lib/pubid/ccsds/scheme.rb +57 -0
- data/lib/pubid/ccsds/single_identifier.rb +77 -0
- data/lib/pubid/ccsds/supplement_identifier.rb +33 -0
- data/lib/pubid/ccsds/urn_generator.rb +115 -0
- data/lib/pubid/ccsds.rb +21 -0
- data/lib/pubid/cen_cenelec/builder.rb +330 -0
- data/lib/pubid/cen_cenelec/identifier.rb +52 -0
- data/lib/pubid/cen_cenelec/identifiers/adopted_european_norm.rb +40 -0
- data/lib/pubid/cen_cenelec/identifiers/amendment.rb +29 -0
- data/lib/pubid/cen_cenelec/identifiers/base.rb +75 -0
- data/lib/pubid/cen_cenelec/identifiers/cen_report.rb +28 -0
- data/lib/pubid/cen_cenelec/identifiers/cen_workshop_agreement.rb +27 -0
- data/lib/pubid/cen_cenelec/identifiers/cenelec_harmonization_document.rb +28 -0
- data/lib/pubid/cen_cenelec/identifiers/consolidated_identifier.rb +61 -0
- data/lib/pubid/cen_cenelec/identifiers/corrigendum.rb +35 -0
- data/lib/pubid/cen_cenelec/identifiers/european_norm.rb +41 -0
- data/lib/pubid/cen_cenelec/identifiers/european_prestandard.rb +37 -0
- data/lib/pubid/cen_cenelec/identifiers/european_specification.rb +28 -0
- data/lib/pubid/cen_cenelec/identifiers/fragment.rb +22 -0
- data/lib/pubid/cen_cenelec/identifiers/guide.rb +27 -0
- data/lib/pubid/cen_cenelec/identifiers/harmonization_document.rb +27 -0
- data/lib/pubid/cen_cenelec/identifiers/technical_report.rb +27 -0
- data/lib/pubid/cen_cenelec/identifiers/technical_specification.rb +35 -0
- data/lib/pubid/cen_cenelec/identifiers.rb +32 -0
- data/lib/pubid/cen_cenelec/parser.rb +144 -0
- data/lib/pubid/cen_cenelec/scheme.rb +164 -0
- data/lib/pubid/cen_cenelec/single_identifier.rb +130 -0
- data/lib/pubid/cen_cenelec/supplement_identifier.rb +48 -0
- data/lib/pubid/cen_cenelec/urn_generator.rb +129 -0
- data/lib/pubid/cen_cenelec.rb +21 -0
- data/lib/pubid/cie/builder.rb +399 -0
- data/lib/pubid/cie/components/code.rb +72 -0
- data/lib/pubid/cie/components/language.rb +58 -0
- data/lib/pubid/cie/identifier.rb +71 -0
- data/lib/pubid/cie/identifiers/bundle.rb +20 -0
- data/lib/pubid/cie/identifiers/conference.rb +32 -0
- data/lib/pubid/cie/identifiers/corrigendum.rb +40 -0
- data/lib/pubid/cie/identifiers/dual_published.rb +41 -0
- data/lib/pubid/cie/identifiers/identical.rb +64 -0
- data/lib/pubid/cie/identifiers/joint_published.rb +52 -0
- data/lib/pubid/cie/identifiers/standard.rb +58 -0
- data/lib/pubid/cie/identifiers/supplement.rb +45 -0
- data/lib/pubid/cie/identifiers/tutorial_bundle.rb +20 -0
- data/lib/pubid/cie/identifiers.rb +17 -0
- data/lib/pubid/cie/parser.rb +347 -0
- data/lib/pubid/cie/scheme.rb +64 -0
- data/lib/pubid/cie/single_identifier.rb +30 -0
- data/lib/pubid/cie/supplement_identifier.rb +26 -0
- data/lib/pubid/cie/urn_generator.rb +123 -0
- data/lib/pubid/cie.rb +28 -0
- data/lib/pubid/components/code.rb +33 -0
- data/lib/pubid/components/date.rb +49 -0
- data/lib/pubid/components/edition.rb +32 -0
- data/lib/pubid/components/factory.rb +50 -0
- data/lib/pubid/components/language.rb +37 -0
- data/lib/pubid/components/locality.rb +10 -0
- data/lib/pubid/components/publisher.rb +36 -0
- data/lib/pubid/components/stage.rb +54 -0
- data/lib/pubid/components/type.rb +58 -0
- data/lib/pubid/components/typed_stage.rb +59 -0
- data/lib/pubid/components.rb +16 -0
- data/lib/pubid/core/pattern_doc_generator.rb +272 -0
- data/lib/pubid/core/update_codes.rb +77 -0
- data/lib/pubid/core.rb +8 -0
- data/lib/pubid/csa/builder.rb +671 -0
- data/lib/pubid/csa/components/code.rb +9 -0
- data/lib/pubid/csa/components.rb +9 -0
- data/lib/pubid/csa/composite_identifier.rb +27 -0
- data/lib/pubid/csa/identifier.rb +513 -0
- data/lib/pubid/csa/identifiers/base.rb +133 -0
- data/lib/pubid/csa/identifiers/bundled.rb +125 -0
- data/lib/pubid/csa/identifiers/canadian_adopted.rb +82 -0
- data/lib/pubid/csa/identifiers/cec.rb +129 -0
- data/lib/pubid/csa/identifiers/combined.rb +130 -0
- data/lib/pubid/csa/identifiers/csa_adopted.rb +78 -0
- data/lib/pubid/csa/identifiers/package.rb +65 -0
- data/lib/pubid/csa/identifiers/series.rb +127 -0
- data/lib/pubid/csa/identifiers/standard.rb +10 -0
- data/lib/pubid/csa/identifiers.rb +17 -0
- data/lib/pubid/csa/parser.rb +445 -0
- data/lib/pubid/csa/scheme.rb +44 -0
- data/lib/pubid/csa/single_identifier.rb +30 -0
- data/lib/pubid/csa/urn_generator.rb +80 -0
- data/lib/pubid/csa/wrapper_identifier.rb +31 -0
- data/lib/pubid/csa.rb +25 -0
- data/lib/pubid/etsi/builder.rb +133 -0
- data/lib/pubid/etsi/components/code.rb +42 -0
- data/lib/pubid/etsi/components/version.rb +37 -0
- data/lib/pubid/etsi/components.rb +10 -0
- data/lib/pubid/etsi/identifier.rb +57 -0
- data/lib/pubid/etsi/identifiers/amendment.rb +15 -0
- data/lib/pubid/etsi/identifiers/base.rb +38 -0
- data/lib/pubid/etsi/identifiers/corrigendum.rb +15 -0
- data/lib/pubid/etsi/identifiers/etsi_standard.rb +19 -0
- data/lib/pubid/etsi/identifiers/supplement_identifier.rb +91 -0
- data/lib/pubid/etsi/identifiers.rb +14 -0
- data/lib/pubid/etsi/parser.rb +133 -0
- data/lib/pubid/etsi/scheme.rb +42 -0
- data/lib/pubid/etsi/urn_generator.rb +76 -0
- data/lib/pubid/etsi.rb +21 -0
- data/lib/pubid/export/auditor.rb +89 -0
- data/lib/pubid/export/data_class_exporter.rb +59 -0
- data/lib/pubid/export/exporter.rb +74 -0
- data/lib/pubid/export/flavor_exporter.rb +402 -0
- data/lib/pubid/export/ieee_exporter.rb +78 -0
- data/lib/pubid/export/itu_exporter.rb +66 -0
- data/lib/pubid/export/nist_exporter.rb +64 -0
- data/lib/pubid/export/registry_exporter.rb +90 -0
- data/lib/pubid/export/result.rb +97 -0
- data/lib/pubid/export/scheme_exporter.rb +70 -0
- data/lib/pubid/export.rb +18 -0
- data/lib/pubid/format_detector.rb +16 -0
- data/lib/pubid/format_registry.rb +42 -0
- data/lib/pubid/identifier.rb +242 -0
- data/lib/pubid/identifier_metadata.rb +148 -0
- data/lib/pubid/identifier_registry.rb +198 -0
- data/lib/pubid/idf/builder.rb +82 -0
- data/lib/pubid/idf/identifier.rb +129 -0
- data/lib/pubid/idf/identifiers/amendment.rb +27 -0
- data/lib/pubid/idf/identifiers/corrigendum.rb +27 -0
- data/lib/pubid/idf/identifiers/international_standard.rb +123 -0
- data/lib/pubid/idf/identifiers/reviewed_method.rb +100 -0
- data/lib/pubid/idf/identifiers.rb +13 -0
- data/lib/pubid/idf/parser.rb +143 -0
- data/lib/pubid/idf/scheme.rb +61 -0
- data/lib/pubid/idf/single_identifier.rb +19 -0
- data/lib/pubid/idf/supplement_identifier.rb +43 -0
- data/lib/pubid/idf/urn_generator.rb +84 -0
- data/lib/pubid/idf.rb +25 -0
- data/lib/pubid/iec/builder.rb +458 -0
- data/lib/pubid/iec/components/code.rb +60 -0
- data/lib/pubid/iec/components/consolidated_amendment.rb +59 -0
- data/lib/pubid/iec/components/publisher.rb +36 -0
- data/lib/pubid/iec/components/sheet.rb +32 -0
- data/lib/pubid/iec/components/trf_info.rb +38 -0
- data/lib/pubid/iec/components/vap_suffix.rb +41 -0
- data/lib/pubid/iec/identifier.rb +256 -0
- data/lib/pubid/iec/identifiers/amendment.rb +94 -0
- data/lib/pubid/iec/identifiers/base.rb +82 -0
- data/lib/pubid/iec/identifiers/component_specification.rb +39 -0
- data/lib/pubid/iec/identifiers/conformity_assessment.rb +39 -0
- data/lib/pubid/iec/identifiers/consolidated_identifier.rb +82 -0
- data/lib/pubid/iec/identifiers/corrigendum.rb +94 -0
- data/lib/pubid/iec/identifiers/fragment_identifier.rb +137 -0
- data/lib/pubid/iec/identifiers/guide.rb +104 -0
- data/lib/pubid/iec/identifiers/international_standard.rb +147 -0
- data/lib/pubid/iec/identifiers/interpretation_sheet.rb +104 -0
- data/lib/pubid/iec/identifiers/operational_document.rb +39 -0
- data/lib/pubid/iec/identifiers/publicly_available_specification.rb +101 -0
- data/lib/pubid/iec/identifiers/sheet_identifier.rb +62 -0
- data/lib/pubid/iec/identifiers/societal_technology_trend_report.rb +40 -0
- data/lib/pubid/iec/identifiers/systems_reference_document.rb +40 -0
- data/lib/pubid/iec/identifiers/technical_report.rb +132 -0
- data/lib/pubid/iec/identifiers/technical_specification.rb +132 -0
- data/lib/pubid/iec/identifiers/technology_report.rb +39 -0
- data/lib/pubid/iec/identifiers/test_report_form.rb +78 -0
- data/lib/pubid/iec/identifiers/vap_identifier.rb +73 -0
- data/lib/pubid/iec/identifiers/white_paper.rb +39 -0
- data/lib/pubid/iec/identifiers/working_document.rb +96 -0
- data/lib/pubid/iec/parser.rb +417 -0
- data/lib/pubid/iec/rendering_style.rb +113 -0
- data/lib/pubid/iec/scheme.rb +71 -0
- data/lib/pubid/iec/single_identifier.rb +80 -0
- data/lib/pubid/iec/supplement_identifier.rb +161 -0
- data/lib/pubid/iec/urn_generator.rb +79 -0
- data/lib/pubid/iec/urn_parser.rb +90 -0
- data/lib/pubid/iec.rb +85 -0
- data/lib/pubid/ieee/aiee/builder.rb +71 -0
- data/lib/pubid/ieee/aiee/identifier.rb +105 -0
- data/lib/pubid/ieee/aiee/parser.rb +130 -0
- data/lib/pubid/ieee/aiee.rb +11 -0
- data/lib/pubid/ieee/builder.rb +1237 -0
- data/lib/pubid/ieee/components/code.rb +102 -0
- data/lib/pubid/ieee/components/draft.rb +93 -0
- data/lib/pubid/ieee/components/relationship.rb +157 -0
- data/lib/pubid/ieee/components/typed_stage.rb +100 -0
- data/lib/pubid/ieee/identifier.rb +54 -0
- data/lib/pubid/ieee/identifiers/adopted_standard.rb +33 -0
- data/lib/pubid/ieee/identifiers/base.rb +591 -0
- data/lib/pubid/ieee/identifiers/conformance_identifier.rb +35 -0
- data/lib/pubid/ieee/identifiers/corrigendum.rb +37 -0
- data/lib/pubid/ieee/identifiers/csa_dual_published.rb +51 -0
- data/lib/pubid/ieee/identifiers/dual_identifier.rb +18 -0
- data/lib/pubid/ieee/identifiers/dual_published.rb +28 -0
- data/lib/pubid/ieee/identifiers/iec_ieee_copublished.rb +27 -0
- data/lib/pubid/ieee/identifiers/interpretation_identifier.rb +34 -0
- data/lib/pubid/ieee/identifiers/joint_development.rb +172 -0
- data/lib/pubid/ieee/identifiers/multi_numbered_identifier.rb +51 -0
- data/lib/pubid/ieee/identifiers/nesc/base.rb +56 -0
- data/lib/pubid/ieee/identifiers/nesc/draft.rb +28 -0
- data/lib/pubid/ieee/identifiers/nesc/handbook.rb +32 -0
- data/lib/pubid/ieee/identifiers/nesc/redline.rb +26 -0
- data/lib/pubid/ieee/identifiers/nesc/standard.rb +26 -0
- data/lib/pubid/ieee/identifiers/nesc.rb +15 -0
- data/lib/pubid/ieee/identifiers/parenthetical_identifier.rb +20 -0
- data/lib/pubid/ieee/identifiers/project_draft_identifier.rb +26 -0
- data/lib/pubid/ieee/identifiers/redlined_standard.rb +33 -0
- data/lib/pubid/ieee/identifiers/si_standard.rb +73 -0
- data/lib/pubid/ieee/identifiers/standard.rb +41 -0
- data/lib/pubid/ieee/identifiers/supplement_identifier.rb +23 -0
- data/lib/pubid/ieee/identifiers.rb +33 -0
- data/lib/pubid/ieee/ire/builder.rb +61 -0
- data/lib/pubid/ieee/ire/identifier.rb +58 -0
- data/lib/pubid/ieee/ire/parser.rb +91 -0
- data/lib/pubid/ieee/ire.rb +11 -0
- data/lib/pubid/ieee/nesc/builder.rb +101 -0
- data/lib/pubid/ieee/nesc/parser.rb +154 -0
- data/lib/pubid/ieee/nesc.rb +10 -0
- data/lib/pubid/ieee/parser.rb +1226 -0
- data/lib/pubid/ieee/scheme.rb +90 -0
- data/lib/pubid/ieee/typed_stages.rb +172 -0
- data/lib/pubid/ieee/urn_generator.rb +188 -0
- data/lib/pubid/ieee.rb +32 -0
- data/lib/pubid/ieee_debug.rb +31 -0
- data/lib/pubid/iho/builder.rb +37 -0
- data/lib/pubid/iho/identifier.rb +61 -0
- data/lib/pubid/iho/identifiers/base.rb +41 -0
- data/lib/pubid/iho/identifiers/bibliographic.rb +16 -0
- data/lib/pubid/iho/identifiers/circular_letter.rb +15 -0
- data/lib/pubid/iho/identifiers/miscellaneous.rb +16 -0
- data/lib/pubid/iho/identifiers/publication.rb +15 -0
- data/lib/pubid/iho/identifiers/standard.rb +15 -0
- data/lib/pubid/iho/identifiers.rb +14 -0
- data/lib/pubid/iho/parser.rb +68 -0
- data/lib/pubid/iho/scheme.rb +29 -0
- data/lib/pubid/iho/urn_generator.rb +29 -0
- data/lib/pubid/iho.rb +21 -0
- data/lib/pubid/iso/builder.rb +309 -0
- data/lib/pubid/iso/bundled_identifier.rb +85 -0
- data/lib/pubid/iso/combined_identifier.rb +22 -0
- data/lib/pubid/iso/components/code.rb +36 -0
- data/lib/pubid/iso/components/publisher.rb +60 -0
- data/lib/pubid/iso/components.rb +12 -0
- data/lib/pubid/iso/format_resolver.rb +45 -0
- data/lib/pubid/iso/identifier.rb +330 -0
- data/lib/pubid/iso/identifiers/addendum.rb +104 -0
- data/lib/pubid/iso/identifiers/amendment.rb +128 -0
- data/lib/pubid/iso/identifiers/base.rb +115 -0
- data/lib/pubid/iso/identifiers/corrigendum.rb +108 -0
- data/lib/pubid/iso/identifiers/data.rb +76 -0
- data/lib/pubid/iso/identifiers/directives.rb +59 -0
- data/lib/pubid/iso/identifiers/directives_supplement.rb +119 -0
- data/lib/pubid/iso/identifiers/extract.rb +30 -0
- data/lib/pubid/iso/identifiers/guide.rb +100 -0
- data/lib/pubid/iso/identifiers/international_standard.rb +168 -0
- data/lib/pubid/iso/identifiers/international_standardized_profile.rb +94 -0
- data/lib/pubid/iso/identifiers/international_workshop_agreement.rb +89 -0
- data/lib/pubid/iso/identifiers/pas.rb +93 -0
- data/lib/pubid/iso/identifiers/recommendation.rb +45 -0
- data/lib/pubid/iso/identifiers/supplement.rb +87 -0
- data/lib/pubid/iso/identifiers/tc_document.rb +108 -0
- data/lib/pubid/iso/identifiers/technical_report.rb +103 -0
- data/lib/pubid/iso/identifiers/technical_specification.rb +102 -0
- data/lib/pubid/iso/identifiers/technology_trends_assessments.rb +95 -0
- data/lib/pubid/iso/identifiers.rb +33 -0
- data/lib/pubid/iso/parser.rb +512 -0
- data/lib/pubid/iso/rendering_style.rb +120 -0
- data/lib/pubid/iso/scheme.rb +193 -0
- data/lib/pubid/iso/single_identifier.rb +64 -0
- data/lib/pubid/iso/supplement_identifier.rb +27 -0
- data/lib/pubid/iso/urn_generator.rb +426 -0
- data/lib/pubid/iso/urn_parser.rb +437 -0
- data/lib/pubid/iso/utilities.rb +86 -0
- data/lib/pubid/iso.rb +50 -0
- data/lib/pubid/itu/builder.rb +171 -0
- data/lib/pubid/itu/components/code.rb +39 -0
- data/lib/pubid/itu/components/sector.rb +35 -0
- data/lib/pubid/itu/components/series.rb +29 -0
- data/lib/pubid/itu/i18n.rb +9 -0
- data/lib/pubid/itu/i18n.yaml +30 -0
- data/lib/pubid/itu/identifier.rb +118 -0
- data/lib/pubid/itu/identifiers/amendment.rb +43 -0
- data/lib/pubid/itu/identifiers/annex.rb +74 -0
- data/lib/pubid/itu/identifiers/base.rb +154 -0
- data/lib/pubid/itu/identifiers/combined_identifier.rb +47 -0
- data/lib/pubid/itu/identifiers/corrigendum.rb +44 -0
- data/lib/pubid/itu/identifiers/recommendation.rb +16 -0
- data/lib/pubid/itu/identifiers/special_publication.rb +31 -0
- data/lib/pubid/itu/identifiers/supplement.rb +46 -0
- data/lib/pubid/itu/identifiers.rb +16 -0
- data/lib/pubid/itu/model.rb +111 -0
- data/lib/pubid/itu/parser.rb +225 -0
- data/lib/pubid/itu/scheme.rb +174 -0
- data/lib/pubid/itu/urn_generator.rb +105 -0
- data/lib/pubid/itu.rb +22 -0
- data/lib/pubid/jcgm/builder.rb +88 -0
- data/lib/pubid/jcgm/components/publisher.rb +20 -0
- data/lib/pubid/jcgm/components.rb +9 -0
- data/lib/pubid/jcgm/identifier.rb +54 -0
- data/lib/pubid/jcgm/identifiers/amendment.rb +35 -0
- data/lib/pubid/jcgm/identifiers/guide.rb +21 -0
- data/lib/pubid/jcgm/identifiers/gum_guide.rb +51 -0
- data/lib/pubid/jcgm/identifiers.rb +11 -0
- data/lib/pubid/jcgm/parser.rb +84 -0
- data/lib/pubid/jcgm/scheme.rb +60 -0
- data/lib/pubid/jcgm/single_identifier.rb +48 -0
- data/lib/pubid/jcgm/supplement_identifier.rb +16 -0
- data/lib/pubid/jcgm/urn_generator.rb +110 -0
- data/lib/pubid/jcgm.rb +31 -0
- data/lib/pubid/jis/builder.rb +124 -0
- data/lib/pubid/jis/components/code.rb +59 -0
- data/lib/pubid/jis/components.rb +9 -0
- data/lib/pubid/jis/identifier.rb +61 -0
- data/lib/pubid/jis/identifiers/amendment.rb +16 -0
- data/lib/pubid/jis/identifiers/base.rb +72 -0
- data/lib/pubid/jis/identifiers/explanation.rb +22 -0
- data/lib/pubid/jis/identifiers/japanese_industrial_standard.rb +16 -0
- data/lib/pubid/jis/identifiers/standard.rb +27 -0
- data/lib/pubid/jis/identifiers/technical_report.rb +31 -0
- data/lib/pubid/jis/identifiers/technical_specification.rb +31 -0
- data/lib/pubid/jis/identifiers.rb +17 -0
- data/lib/pubid/jis/parser.rb +109 -0
- data/lib/pubid/jis/scheme.rb +49 -0
- data/lib/pubid/jis/single_identifier.rb +37 -0
- data/lib/pubid/jis/supplement_identifier.rb +47 -0
- data/lib/pubid/jis/urn_generator.rb +25 -0
- data/lib/pubid/jis.rb +23 -0
- data/lib/pubid/lutaml/no_store_registration.rb +30 -0
- data/lib/pubid/nist/builder.rb +2269 -0
- data/lib/pubid/nist/components/code.rb +38 -0
- data/lib/pubid/nist/components/edition.rb +134 -0
- data/lib/pubid/nist/components/issue_number.rb +28 -0
- data/lib/pubid/nist/components/part.rb +77 -0
- data/lib/pubid/nist/components/publisher.rb +24 -0
- data/lib/pubid/nist/components/stage.rb +53 -0
- data/lib/pubid/nist/components/supplement.rb +188 -0
- data/lib/pubid/nist/components/translation.rb +42 -0
- data/lib/pubid/nist/components/update.rb +103 -0
- data/lib/pubid/nist/components/version.rb +35 -0
- data/lib/pubid/nist/components/volume.rb +32 -0
- data/lib/pubid/nist/components.rb +19 -0
- data/lib/pubid/nist/configuration.rb +77 -0
- data/lib/pubid/nist/identifier.rb +62 -0
- data/lib/pubid/nist/identifiers/base.rb +578 -0
- data/lib/pubid/nist/identifiers/circular.rb +68 -0
- data/lib/pubid/nist/identifiers/circular_supplement.rb +50 -0
- data/lib/pubid/nist/identifiers/commercial_standard.rb +41 -0
- data/lib/pubid/nist/identifiers/commercial_standard_emergency.rb +56 -0
- data/lib/pubid/nist/identifiers/commercial_standards_monthly.rb +56 -0
- data/lib/pubid/nist/identifiers/crpl_report.rb +132 -0
- data/lib/pubid/nist/identifiers/federal_information_processing_standards.rb +104 -0
- data/lib/pubid/nist/identifiers/grant_contractor_report.rb +35 -0
- data/lib/pubid/nist/identifiers/handbook.rb +50 -0
- data/lib/pubid/nist/identifiers/internal_report.rb +56 -0
- data/lib/pubid/nist/identifiers/letter_circular.rb +45 -0
- data/lib/pubid/nist/identifiers/miscellaneous_publication.rb +65 -0
- data/lib/pubid/nist/identifiers/monograph.rb +69 -0
- data/lib/pubid/nist/identifiers/ncstar.rb +41 -0
- data/lib/pubid/nist/identifiers/nsrds.rb +41 -0
- data/lib/pubid/nist/identifiers/owmwp.rb +35 -0
- data/lib/pubid/nist/identifiers/report.rb +67 -0
- data/lib/pubid/nist/identifiers/special_publication.rb +36 -0
- data/lib/pubid/nist/identifiers/technical_note.rb +90 -0
- data/lib/pubid/nist/identifiers.rb +33 -0
- data/lib/pubid/nist/parser.rb +1117 -0
- data/lib/pubid/nist/scheme.rb +199 -0
- data/lib/pubid/nist/supplement_identifier.rb +67 -0
- data/lib/pubid/nist/urn_generator.rb +133 -0
- data/lib/pubid/nist.rb +37 -0
- data/lib/pubid/oiml/builder.rb +189 -0
- data/lib/pubid/oiml/components/code.rb +20 -0
- data/lib/pubid/oiml/components.rb +9 -0
- data/lib/pubid/oiml/identifier.rb +61 -0
- data/lib/pubid/oiml/identifiers/amendment.rb +13 -0
- data/lib/pubid/oiml/identifiers/annex.rb +62 -0
- data/lib/pubid/oiml/identifiers/base.rb +36 -0
- data/lib/pubid/oiml/identifiers/basic_publication.rb +13 -0
- data/lib/pubid/oiml/identifiers/document.rb +13 -0
- data/lib/pubid/oiml/identifiers/expert_report.rb +13 -0
- data/lib/pubid/oiml/identifiers/guide.rb +13 -0
- data/lib/pubid/oiml/identifiers/recommendation.rb +13 -0
- data/lib/pubid/oiml/identifiers/seminar_report.rb +13 -0
- data/lib/pubid/oiml/identifiers/vocabulary.rb +13 -0
- data/lib/pubid/oiml/identifiers.rb +18 -0
- data/lib/pubid/oiml/parser.rb +173 -0
- data/lib/pubid/oiml/scheme.rb +46 -0
- data/lib/pubid/oiml/single_identifier.rb +90 -0
- data/lib/pubid/oiml/supplement_identifier.rb +43 -0
- data/lib/pubid/oiml/urn_generator.rb +64 -0
- data/lib/pubid/oiml.rb +26 -0
- data/lib/pubid/parser/common_parse_methods.rb +13 -0
- data/lib/pubid/parser/common_parse_rules.rb +56 -0
- data/lib/pubid/parser.rb +8 -0
- data/lib/pubid/parsers/base.rb +11 -0
- data/lib/pubid/parsers/mr_string.rb +93 -0
- data/lib/pubid/plateau/builder.rb +50 -0
- data/lib/pubid/plateau/identifier.rb +57 -0
- data/lib/pubid/plateau/identifiers/annex.rb +16 -0
- data/lib/pubid/plateau/identifiers/base.rb +51 -0
- data/lib/pubid/plateau/identifiers/handbook.rb +34 -0
- data/lib/pubid/plateau/identifiers/technical_report.rb +20 -0
- data/lib/pubid/plateau/identifiers.rb +12 -0
- data/lib/pubid/plateau/parser.rb +63 -0
- data/lib/pubid/plateau/scheme.rb +45 -0
- data/lib/pubid/plateau/supplement_identifier.rb +72 -0
- data/lib/pubid/plateau/urn_generator.rb +29 -0
- data/lib/pubid/plateau.rb +26 -0
- data/lib/pubid/renderers/base.rb +53 -0
- data/lib/pubid/renderers/directives_renderer.rb +61 -0
- data/lib/pubid/renderers/guide_renderer.rb +24 -0
- data/lib/pubid/renderers/human_readable.rb +70 -0
- data/lib/pubid/renderers/iwa_renderer.rb +20 -0
- data/lib/pubid/renderers/mr_string.rb +16 -0
- data/lib/pubid/renderers/supplement_renderer.rb +36 -0
- data/lib/pubid/renderers/urn.rb +11 -0
- data/lib/pubid/renderers.rb +14 -0
- data/lib/pubid/rendering/base.rb +73 -0
- data/lib/pubid/rendering/common.rb +211 -0
- data/lib/pubid/rendering/context.rb +159 -0
- data/lib/pubid/rendering/date.rb +27 -0
- data/lib/pubid/rendering/format.rb +25 -0
- data/lib/pubid/rendering/language.rb +21 -0
- data/lib/pubid/rendering/numbering.rb +24 -0
- data/lib/pubid/rendering/publisher.rb +25 -0
- data/lib/pubid/rendering/stage.rb +38 -0
- data/lib/pubid/rendering/supplement.rb +46 -0
- data/lib/pubid/rendering.rb +16 -0
- data/lib/pubid/sae/builder.rb +32 -0
- data/lib/pubid/sae/components/code.rb +9 -0
- data/lib/pubid/sae/components/date.rb +19 -0
- data/lib/pubid/sae/components/type.rb +19 -0
- data/lib/pubid/sae/components.rb +11 -0
- data/lib/pubid/sae/identifier.rb +37 -0
- data/lib/pubid/sae/identifiers/base.rb +42 -0
- data/lib/pubid/sae/identifiers.rb +9 -0
- data/lib/pubid/sae/parser.rb +55 -0
- data/lib/pubid/sae/scheme.rb +47 -0
- data/lib/pubid/sae/urn_generator.rb +38 -0
- data/lib/pubid/sae.rb +19 -0
- data/lib/pubid/scheme.rb +219 -0
- data/lib/pubid/urn_generator/base.rb +110 -0
- data/lib/pubid/utils/string_normalizer.rb +196 -0
- data/lib/pubid/utils.rb +7 -0
- data/lib/pubid/version.rb +3 -1
- data/lib/pubid.rb +137 -13
- data/lib/tasks/docs.rake +37 -0
- data/lib/tasks/export.rake +38 -0
- data/lib/tasks/website-data.json +7488 -0
- metadata +616 -171
- data/lib/pubid/registry.rb +0 -30
|
@@ -0,0 +1,1117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "parslet"
|
|
4
|
+
|
|
5
|
+
module Pubid
|
|
6
|
+
module Nist
|
|
7
|
+
# Parser class for NIST identifiers
|
|
8
|
+
# Single Responsibility: Parsing NIST identifier syntax
|
|
9
|
+
class Parser < Parslet::Parser
|
|
10
|
+
# Class-level parse method with preprocessing
|
|
11
|
+
# Handles data quality normalization before parsing
|
|
12
|
+
# Named explicitly to avoid conflict with Parslet's built-in parse method
|
|
13
|
+
# Normalizes a raw NIST/NBS identifier string and parses it with the grammar.
#
# The input is coerced to a String, stripped, passed through
# Core::UpdateCodes.apply, and then rewritten by an ORDERED chain of regex
# substitutions. Many steps depend on the output of earlier steps, so the
# order of the statements below must not be changed casually.
#
# @param input [#to_s] raw identifier text
# @return [Hash] the parse tree merged with +parsed_format:+ (:mr or :short)
#   when the parser yields a Hash or an Array of Hashes; any other parser
#   result is returned unchanged
# @raise whatever +new.parse+ raises on unparseable input (presumably
#   Parslet::ParseFailed — confirm against callers)
def self.class_parse_with_preprocessing(input)
  # Apply legacy update_codes normalization first, before any other preprocessing
  cleaned = Core::UpdateCodes.apply(input.to_s.strip, :nist)

  # Fix lowercase publisher at start
  cleaned = cleaned.sub(/^nbs\b/i, "NBS")
  cleaned = cleaned.sub(/^nist\b/i, "NIST")

  # Fix publisher+series concatenation: "NISTIR" → "NIST IR", "NBSIR" → "NBS IR".
  # The match is case-insensitive, so "nistir" is split as well; the series
  # part is upcased by the next step.
  cleaned = cleaned.gsub(
    /^(NBS|NIST)(IR|FIPS|GCR|HB|MONO|MP|NCSTAR|NSRDS)/i, '\1 \2'
  )

  # Fix lowercase series (ir, sp, tn, etc.) — first occurrence only
  cleaned = cleaned.sub(/\b(ir|sp|tn|hb|fips|ams|vts)\b/i, &:upcase)

  # Normalize standalone "LC" to "LCIRC"
  cleaned = cleaned.gsub(/\bLC\b(?!IRC)/, "LCIRC")

  # Combine "NBS LCIRC" into "NBS.LCIRC" ONLY when a supplement/revision
  # marker follows later in the string, so the circ_supplement_identifier
  # rule can match. Regular LCIRC identifiers keep the space.
  cleaned = cleaned.gsub(/\bNBS LCIRC\b(?=.*\b(?:supp?|sup\+|r\d+\/)\d)/,
                         "NBS.LCIRC")

  # Convert MR-format LCIRC revisions back to the space-separated form:
  # "NBS.LCIRC.145r11/1925" → "NBS LCIRC 145r11/1925"
  cleaned = cleaned.gsub(/\bNBS\.LCIRC\.(\d+r\d+\/\d{4})/,
                         "NBS LCIRC \\1")
  # Same without the year: "NBS.LCIRC.145r11" → "NBS LCIRC 145r11"
  cleaned = cleaned.gsub(/\bNBS\.LCIRC\.(\d+r\d+)\b/, "NBS LCIRC \\1")

  # Roman-numeral numbers: join a space-separated trailing number with a dot,
  # "1011-I-2 0" → "1011-I-2.0"
  cleaned = cleaned.gsub(/([-\d]+[IVX]+[-\d]+)\s+(\d+)/, '\1.\2')

  # Separate number from rev+year: "126rev2013" → "126 rev2013".
  # The lookbehind keeps edition+revision forms such as "e2rev1908" intact.
  cleaned = cleaned.gsub(/(?<!e)(\d)(rev\d{4})/, '\1 \2')

  # Revision with slash and year: "145r6/1925" → "145 r6/1925".
  # Skipped for CIRC/LCIRC identifiers, where the circ_supplement_identifier
  # rule expects "145r11" without a space. ("LCIRC" contains "CIRC", so the
  # first test is implied by the second; both are kept as written.)
  unless cleaned.include?("LCIRC") || cleaned.include?("CIRC")
    cleaned = cleaned.gsub(/(\d)(r\d+\/\d{4})/, '\1 \2')
  end

  # Prefix a standalone "rYYYY" token with a space.
  # NOTE(review): the original comment claims "1128r1995" → "1128 r1995", but
  # the leading \b cannot match between a digit and "r", so only an "rYYYY"
  # that already starts a word is affected. Left unchanged pending confirmation.
  cleaned = cleaned.gsub(/\b(r(?!v)\d{4})\b/, ' \1')

  # Month in revision: "4743rJun1992" → "4743 rJun1992"
  cleaned = cleaned.gsub(/(\d)(r[A-Z][a-z]{2,8}\d{4})/, '\1 \2')

  # Uppercase a trailing dash-letter suffix: "6529-a" → "6529-A"
  cleaned = cleaned.gsub(/(\d)-([a-z])$/) { "#{$1}-#{$2.upcase}" }

  # Uppercase a trailing direct letter suffix: "378g" → "378G", "97-3b" → "97-3B".
  # "r" is excluded so revision markers such as "6945r" stay lowercase.
  cleaned = cleaned.gsub(/(\d)([a-z&&[^r]])$/) { "#{$1}#{$2.upcase}" }
  # Trailing revision with letter: "22r1a" → "22r1A"
  cleaned = cleaned.gsub(/(\d)(r)(\d+)([a-z])$/) do
    "#{$1}#{$2}#{$3}#{$4.upcase}"
  end
  # Letter suffix between number and revision: "53ar1" → "53Ar1"
  cleaned = cleaned.gsub(/(\d)([a-z])(r\d)/) { "#{$1}#{$2.upcase}#{$3}" }

  # Letter suffix before volume: "1-2bv1" → "1-2Bv1".
  # "r" is excluded to preserve "rv" forms ("1013rv1953"); NCSTAR identifiers
  # are skipped entirely so forms like "1-1av1" keep their lowercase letter.
  is_ncstar = cleaned.include?("NCSTAR")
  unless is_ncstar
    cleaned = cleaned.gsub(/(\d)([a-z&&[^r]])(v\d+)/) do
      "#{$1}#{$2.upcase}#{$3}"
    end
  end

  # Trailing single digit after an NN-NNNN number is a volume:
  # "80-2073 2" → "80-2073 v2"
  cleaned = cleaned.gsub(/(\d{2}-\d{4})\s+(\d)$/, '\1 v\2')

  # Draft with number: "8270-draft2" → "8270 -draft 2"
  cleaned = cleaned.gsub(/(\d)-draft(\d)/, '\1 -draft \2')
  # Draft without dash: "8270draft2" → "8270 -draft 2"
  cleaned = cleaned.gsub(/(\d)draft(\d)/, '\1 -draft \2')

  # Supplement typo: "154suprev" → "154supprev"
  cleaned = cleaned.gsub(/(\d)suprev/, '\1supprev')

  # Letter suffix + revision before draft: "140Cr1-draft" → "140C r1-draft".
  # NOTE(review): the original comment says this must run BEFORE the draft
  # rules, yet it runs after them; a numbered draft ("140Cr1-draft2") has
  # already been rewritten to "140Cr1 -draft 2" and no longer matches here.
  cleaned = cleaned.gsub(/(\d{2,})([A-Z])(r\d+)([-\s]draft\d*)/,
                         '\1\2 \3\4')

  # Convert Roman numeral volumes to Arabic:
  # "1011-I-2.0" → "1011 v1 ver2.0", "1011-II-1.0" → "1011 v2 ver1.0"
  cleaned = cleaned.gsub(/(\d+)-([IVX]+)-(\d+(?:\.\d+)*)/) do
    number = $1
    roman = $2
    version_part = $3

    arabic = roman_to_arabic(roman)

    "#{number} v#{arabic} ver#{version_part}"
  end

  # LCIRC supplement with slash and year: "118supp3/1926" → "118 supp3/1926"
  cleaned = cleaned.gsub(/(\d)(supp\d+\/\d{4})/, '\1 \2')

  # Pt pattern: "800-57Pt3r1" → "800-57 pt3 r1"
  cleaned = cleaned.gsub(/(\d)Pt(\d+)(r\d+)/, '\1 pt\2 \3')

  # Version patterns: "ver1e2006" → "ver1 e2006", "ver2v1" → "ver2 v1"
  cleaned = cleaned.gsub(/(\d)ver(\d)/, '\1 ver \2')
  cleaned = cleaned.gsub(/ver(\d+)e(\d{4})/, 'ver\1 e\2')
  cleaned = cleaned.gsub(/ver(\d+)v(\d+)/, 'ver\1 v\2')

  # Dotted version: separate from number, "268v1.1" → "268 v1.1".
  # The substitution is intentionally applied twice, as in the original code:
  # gsub matches do not overlap, so back-to-back occurrences need a second pass.
  cleaned = cleaned.gsub(/(\d)(v\d+\.\d+)/, '\1 \2')
  cleaned = cleaned.gsub(/(\d)(v\d+\.\d+)/, '\1 \2')

  # Separate version from number AND convert spaces to dots in one step
  cleaned = cleaned.gsub(/(\d)(v\d+)\s+(\d+)$/, '\1 \2.\3') # Two-part: "268v1 1" → "268 v1.1"
  cleaned = cleaned.gsub(/(\d)(v\d+)\s+(\d+)\s+(\d+)$/, '\1 \2.\3.\4') # Three-part: "63v1 0 1" → "63 v1.0.1"

  # Volume ranges: "535v2a-l" → "535 v2a-l", "535v2m-z" → "535 v2m-z"
  cleaned = cleaned.gsub(/(\d)(v\d+[a-z]-[a-z])/, '\1 \2')

  # Volume with uppercase letter: "48v3B" → "48 v3B"
  cleaned = cleaned.gsub(/(\d)(v\d+[A-Z])/, '\1 \2')

  # Volume ranges with uppercase letters: "v2A-L" → "v2a-l".
  # The previous code called String#downcase on the replacement template
  # ('\1\2-\3'), which leaves the back-references untouched, so nothing was
  # ever lowercased. A block is required to transform the captured text.
  # The trailing lookahead restricts this to a single-letter range end.
  cleaned = cleaned.gsub(/(v\d+)([A-Z])-([A-Z])(?![A-Za-z])/) do
    "#{$1}#{$2.downcase}-#{$3.downcase}"
  end

  # Edition with "ed." suffix: "2006ed." → "e2006"
  cleaned = cleaned.gsub(/(\d{4})ed\./, 'e\1')

  # Revision attached to number — must run BEFORE the update patterns below.
  # First rule: revision (1-2 digits) followed by a lowercase letter stays
  # attached and the letter is uppercased: "800-22r1a" → "800-22r1A".
  cleaned = cleaned.gsub(/(\d+)(r\d{1,2})([a-z])(?=-|[A-Z]|$)/) do
    num = $1
    rev = $2
    letter = $3
    "#{num}#{rev}#{letter.upcase}"
  end
  # Second rule: revision WITHOUT a letter suffix gets a space before it, but
  # only when followed by dash+uppercase or by /upd, /errata, /insert.
  # A revision at end of string keeps the compact form ("800-53r4").
  cleaned = cleaned.gsub(/(\d+)(r\d{1,2})(?![a-zA-Z])(?=[A-Z]|-(?=[A-Z])|\/(?:upd|errata|insert))/) do
    num = $1
    rev = $2
    "#{num} #{rev}"
  end

  # Spaces in version/volume numbers: "v1 1" → "v1.1", "v1 0 1" → "v1.0.1".
  # \b keeps the "v" inside "rev" from matching; the negative lookaheads keep
  # the digit of a numeric draft stage ("189 2pd") from being swallowed.
  cleaned = cleaned.gsub(/(\b(?:v|\d)[v\d]*[-A-Z]*)\s+(\d+)(?!(?i:pd|wd|prd)\b)\s+(\d+)(?!(?i:pd|wd|prd)\b)/, '\1.\2.\3') # Three parts
  cleaned = cleaned.gsub(/(\b(?:v|\d)[v\d]*)\s+(\d+)(?!(?i:pd|wd|prd)\b)/, '\1.\2') # Two parts

  # Update patterns: ensure a space before -upd or /upd (optionally numbered)
  cleaned = cleaned.gsub(/(\d+)-upd(\d*)/, '\1 -upd\2') # -upd or -upd1
  cleaned = cleaned.gsub(/(\d+)\/upd(\d*)/, '\1 /upd\2') # /upd or /upd1
  cleaned = cleaned.gsub(/([a-z]\d+)-upd/, '\1 -upd') # r1-upd → r1 -upd
  cleaned = cleaned.gsub(/([a-z]\d+)\/upd/, '\1 /upd') # r1/upd → r1 /upd

  # MR format with letter suffix before update: "8286C-upd1" → "8286C -upd1"
  cleaned = cleaned.gsub(/(\d+[A-Z])-upd(\d*)/, '\1 -upd\2') # Letter suffix + update
  cleaned = cleaned.gsub(/(\d+[A-Z])\/upd(\d*)/, '\1 /upd\2') # Letter suffix + /upd variant

  # Supplement patterns: ensure a space before the supplement marker.
  cleaned = cleaned.gsub(/(\d)(sup\d)/, '\1 \2') # 100-2sup1 → 100-2 sup1
  # NOTE(review): "sup+" here is an unescaped quantifier ("su" + one or more
  # "p"), so this matches "sup1"/"supp1", not a literal "sup+". Kept as-is
  # because it is what currently separates "118supp3" → "118 supp3".
  cleaned = cleaned.gsub(/(\d)(sup+)(\d)/, '\1 \2\3')
  cleaned = cleaned.gsub(/(\d)(sup\+)(\d)/, '\1 \2\3') # 100-2sup+1 → 100-2 sup+1
  cleaned = cleaned.gsub(/(\d)(sup\d+)/, '\1 \2') # 100-2sup1 → 100-2 sup1
  cleaned = cleaned.gsub(/(\d)(sup\d+\b)/, '\1 \2') # 100-2sup1 → 100-2 sup1

  # Letter suffix + supplement: "378Gsup" → "378Gsupp"
  # (normalizes "sup" to "supp" for the circ_supplement_identifier rule)
  cleaned = cleaned.gsub(/(\d+[A-Z])sup(\b)/, '\1supp\2') # 378Gsup → 378Gsupp

  # LCIRC supplement without letter suffix: "118sup12/1926" → "118supp12/1926"
  cleaned = cleaned.gsub(/(\d+)sup(\d+\/\d{4})/, '\1supp\2') # 118sup12/1926 → 118supp12/1926

  # Unify dashed/undashed year supplements: "supp-YYYY" → "suppYYYY", so both
  # spellings yield one parse tree. The lookahead leaves the dash-slash form
  # "supp-12/1926" (supplement_dash_slash_year) untouched.
  cleaned = cleaned.gsub(/(\d)(supp?)-(\d{4})(?![\d\/])/, '\1\2\3') # 25supp-1924 → 25supp1924

  # Number with letter suffix followed by standalone 'r': "56ar" → "56a r"
  cleaned = cleaned.gsub(/(\d[a-z])r\b/, '\1 r')

  # Revision followed by language code: "r1es" → "r1 es", "r1pt" → "r1 pt"
  cleaned = cleaned.gsub(/(r\d+)(es|pt|chi|viet|port|esp)\b/, '\1 \2')

  # MR format translation codes: "NIST.SP.1262.spa" → "NIST.SP.1262 spa".
  # Prevents short translation codes from being parsed as letter suffixes.
  cleaned = cleaned.gsub(/^([A-Z]+)\.SP\.(\d+)\.([a-z]{2,4})$/,
                         '\1.SP.\2 \3')
  cleaned = cleaned.gsub(/^([A-Z]+)\.([A-Z]+)\.(\d+)\.([a-z]{2,4})$/,
                         '\1.\2.\3 \4')

  # Edition year normalization: trailing "-YYYY" → "eYYYY"
  # ("330-2019" → "330e2019"). Lookbehinds skip existing editions
  # ("11e2-1915") and multi-part numbers ("105-1-1990"); the letter class
  # skips an "E" suffix. Only years 1901-2099 are converted; other values are
  # treated as part numbers (e.g. "250-1039") and left alone.
  cleaned = cleaned.gsub(/(?<!e\d)(?<![eE-])(\d(?:[A-DF-Z]?))-(\d{4})(?=\s|$)/) do |match|
    prefix = $1 # Number with optional letter
    year = $2.to_i
    if year.between?(1901, 2099)
      "#{prefix}e#{year}"
    else
      match # Keep dash format for part numbers
    end
  end
  # Revert the conversion for the HB series, which keeps "-YYYY"
  # ("HB 130e1979" → "HB 130-1979"). Dots are excluded from the filler so the
  # MR form "NIST.HB.135e2022" is not reverted.
  cleaned = cleaned.gsub(/\b(HB|HB\s+)[^:\s.]*?(\d+)e(\d{4})(?=\s|$)/,
                         '\1\2-\3')
  # Revert for OWMWP, whose number is a date MM-DD-YYYY:
  # "OWMWP 06-13e2018" → "OWMWP 06-13-2018"
  cleaned = cleaned.gsub(
    /\b(OWMWP|OWMWP\s*)[^:\s]*?(\d{2})-(\d{2})e(\d{4})(?=\s|$)/, '\1\2-\3-\4'
  )
  # Revert for RPT year ranges: "RPT 1946e1947" → "RPT 1946-1947",
  # but only when the first year is smaller than the second.
  cleaned = cleaned.gsub(/\b(RPT|RPT\s*)([^:\s]*?)(\d{4})e(\d{4})(?=\s|$)/) do |match|
    prefix = $1 # "RPT" or "RPT "
    separator = $2 # "." or "" or other non-colon, non-space chars
    first_year = $3.to_i
    second_year = $4.to_i
    if first_year < second_year
      "#{prefix}#{separator}#{first_year}-#{second_year}"
    else
      match # Keep e format otherwise
    end
  end

  # Version normalization. The MR form must be handled BEFORE the general "v"
  # rule: "NIST.SP.500-281-v1.0" → "NIST.SP.500-281.ver1.0"
  cleaned = cleaned.gsub(/-v(\d+\.\d+)/, '.ver\1')

  # "Ver. 2.0" → "ver2.0" (remove period and space)
  cleaned = cleaned.gsub(/\bVer\.\s+(\d+(?:\.\d+)*)/, 'ver\1')
  # "v1.1" → "ver1.1" (dotted numbers only)
  cleaned = cleaned.gsub(/\bv(\d+\.\d+(?:\.\d+)*)/, 'ver\1')

  # Uppercase P for part: "428P1" → "428 p1", "647P2" → "647 p2"
  cleaned = cleaned.gsub(/(\d)P(\d)/, '\1 p\2')

  # Normalize part notation "p1"/"n1" → "pt1"; must come AFTER the uppercase-P
  # step. The lookahead skips part+year forms such as "28p11969".
  cleaned = cleaned.gsub(/\b([pn])(\d+)(?!\d{4}\b)/, 'pt\2')

  # Complex part patterns in MR format: ensure a space before the part
  cleaned = cleaned.gsub(/(\d)([pP]\d+)/, '\1 \2') # .467p1adde1 → .467 p1adde1, 800-57p1 → 800-57 p1

  # CRPL-F series: ensure a space after the series ("CRPL-F-B150" → "CRPL-F-B 150")
  cleaned = cleaned.gsub(/(NBS CRPL-F-[AB])(\d)/, '\1 \2')
  cleaned = cleaned.gsub(/(CRPL-F-[AB])(\d)/, '\1 \2')

  # Extract volume from number: "17-917v3" → "17-917 v3", "1-1v1" → "1-1 v1".
  # The lookahead avoids touching dotted versions.
  cleaned = cleaned.gsub(/(\d+-\d+)(v\d+)(?![.\d])/, '\1 \2')

  # No preprocessing for " 2pd": the pd_suffix grammar rule handles it directly.

  # "Suppl" with space: "955 Suppl" → "955Suppl"
  cleaned = cleaned.gsub(/(\d+)\s+Suppl\b/, '\1Suppl')

  # Verbose "Version" format: " Version 2" → " ver 2"
  cleaned = cleaned.gsub(/\s+Version\s+(\d+)/, ' ver \1')

  # Verbose "Revision" format: " Revision (r)" → " r"
  cleaned = cleaned.gsub(/\s+Revision\s+\(r\)/, " r")

  # Verbose "Part N" → short "ptN", attached to the preceding number:
  # "800-57 Part 2 Rev. 1" → "800-57pt2 Rev. 1"
  cleaned = cleaned.gsub(/\s+Part\s+(\d+)/, 'pt\1')

  # Normalize " Add"/" add" (with or without period) to " Add." and uppercase
  # a doc-number letter directly before it ("800-38a Add" → "800-38A Add.").
  cleaned = cleaned.gsub(/(\d[a-z]?)\s+Add\b\.?/i) { "#{Regexp.last_match(1).upcase} Add." }

  # Verbose "rev YYYY" format: "260-126 rev 2013" → "260-126r2013"
  cleaned = cleaned.gsub(/(\d+)\s+rev\s+(\d{4})/, '\1r\2')

  # Historical "report ;" format: "NBS report ; 8079" → "NBS RPT 8079"
  cleaned = cleaned.gsub(/\breport\s*;\s*/, "RPT ")
  cleaned = cleaned.gsub(/\breport\b/, "RPT")

  # Detect format from the ORIGINAL input (before normalization)
  format = detect_format(input.to_s)

  # Use parslet parser instance
  result = new.parse(cleaned)

  # Attach the detected format to the result
  if result.is_a?(Hash)
    result.merge(parsed_format: format)
  elsif result.is_a?(Array)
    # Array results (several components from the identifier rule) are folded
    # into one Hash; non-Hash elements are ignored.
    merged = result.inject({}) do |acc, hash|
      next acc unless hash.is_a?(Hash)

      acc.merge(hash)
    end
    merged.merge(parsed_format: format)
  else
    result
  end
end
|
|
431
|
+
|
|
432
|
+
# Detect format from input string
|
|
433
|
+
# :mr if contains dots (machine-readable: NIST.SP.800-53)
|
|
434
|
+
# :short otherwise (default: NIST SP 800-53)
|
|
435
|
+
# Detects the identifier format from an input string.
#
# An identifier is machine-readable (:mr) when it contains at least one dot
# and no interior whitespace, e.g. "NIST.SP.800-53", "FIPS.46e1977" or
# "NBS.HB.28pt1e1969". Anything else is :short (e.g. "NIST SP 800-53").
#
# The input is coerced with +to_s+ and stripped first, so +nil+ does not
# raise and surrounding whitespace or a trailing newline does not turn a
# machine-readable identifier into :short.
#
# @param input [#to_s] identifier text
# @return [Symbol] :mr or :short
def self.detect_format(input)
  text = input.to_s.strip

  # Key indicator: dots between components instead of spaces
  if text.include?(".") && !text.match?(/\s/)
    :mr
  else
    :short
  end
end
|
|
448
|
+
|
|
449
|
+
# Convert Roman numerals to Arabic numbers
|
|
450
|
+
# I→1, II→2, III→3, IV→4, V→5, VI→6, VII→7, VIII→8, IX→9, X→10
|
|
451
|
+
# Converts a Roman numeral made of I, V and X to its Arabic value as a String.
#
# Every canonical numeral from I (1) to XXXIX (39) is converted, e.g.
# "I" → "1", "IV" → "4", "IX" → "9", "X" → "10", "XIV" → "14".
# Anything else (non-String values, the empty string, non-canonical forms
# such as "IIII" or "VX", other letters) is returned unchanged.
#
# @param roman [String] Roman numeral
# @return [String, Object] decimal string, or +roman+ itself as a fallback
def self.roman_to_arabic(roman)
  # Canonical subtractive notation restricted to the letters X, V and I.
  canonical = /\A(?=[IVX])X{0,3}(?:I[XV]|V?I{0,3})\z/
  return roman unless roman.is_a?(String) && roman.match?(canonical)

  values = { "I" => 1, "V" => 5, "X" => 10 }
  digits = roman.each_char.map { |char| values.fetch(char) }

  # A symbol smaller than its successor is subtracted (the "I" in "IV").
  total = digits.each_with_index.sum do |value, index|
    following = digits[index + 1]
    if following && value < following
      -value
    else
      value
    end
  end

  total.to_s
end
|
|
466
|
+
|
|
467
|
+
# Basic building blocks
|
|
468
|
+
# Single literal space.
rule(:space) do
  str(" ")
end

# Literal dot.
rule(:dot) do
  str(".")
end

# Literal dash.
rule(:dash) do
  str("-")
end

# Literal forward slash.
rule(:slash) do
  str("/")
end

# One ASCII decimal digit.
rule(:digit) do
  match("[0-9]")
end

# One or more digits.
rule(:digits) do
  digit.repeat(1)
end

# One ASCII letter of either case.
rule(:letter) do
  match("[A-Za-z]")
end

# One uppercase ASCII letter.
rule(:upper_letter) do
  match("[A-Z]")
end

# One lowercase ASCII letter.
rule(:lower_letter) do
  match("[a-z]")
end

# Hash prefix for machine-readable formats
rule(:hash_prefix) do
  str("#")
end
|
|
480
|
+
|
|
481
|
+
# Month abbreviations
|
|
482
|
+
# Month names: the twelve full names are tried first, then the three-letter
# abbreviations ("May" appears only once, among the full names).
rule(:month_abbrev) do
  month_names = %w[
    January February March April
    May June July August
    September October November December
    Jan Feb Mar Apr
    Jun Jul Aug Sep Oct Nov Dec
  ]

  # Fold the list into one ordered alternative, preserving the list order.
  month_names.map { |name| str(name) }.reduce(:|)
end
|
|
489
|
+
|
|
490
|
+
# Language codes for translations - 2-4 letter codes
|
|
491
|
+
# Supports: " spa", "(spa)", ".spa" (MR format)
|
|
492
|
+
# Translation language code, captured as :translation together with its
# optional leading space or dot (" spa", ".spa", or no separator at all).
#
# Known codes are tried longest-first because Parslet alternatives are an
# ordered choice: with "es" listed ahead of "esp", the input "esp" was
# consumed as "es" and the trailing "p" was left for the rest of the grammar.
# Any other run of 2-4 lowercase letters is accepted as a fallback.
rule(:language_code) do
  known_code = str("viet") | str("port") | str("esp") | str("chi") | str("es") | str("pt")

  ((space | dot).maybe >> (known_code | match("[a-z]").repeat(2, 4))).as(:translation)
end
|
|
496
|
+
|
|
497
|
+
# Stage ID: i (initial), f (final), 1-9 (numbered iterations)
|
|
498
|
+
# Stage identifier: "i"/"I", "f"/"F", or a single digit 1-9.
rule(:stage_id) do
  ids = %w[i I f F 1 2 3 4 5 6 7 8 9]
  ids.map { |id| str(id) }.reduce(:|)
end
|
|
503
|
+
|
|
504
|
+
# Stage type: pd (public draft), wd (work-in-progress), prd (preliminary)
|
|
505
|
+
# Stage type: "pd", "wd" or "prd", each in all-lowercase or all-uppercase.
rule(:stage_type) do
  types = %w[pd PD wd WD prd PRD]
  types.map { |type| str(type) }.reduce(:|)
end
|
|
508
|
+
|
|
509
|
+
# Old style stage: (IPD), (FPD), (2PD) - parenthetical at document start
|
|
510
|
+
# Parenthesised stage such as "(IPD)", "(FPD)" or "(2PD)".
# Captures :stage containing :stage_id and :stage_type.
rule(:old_stage) do
  stage = (stage_id.as(:stage_id) >> stage_type.as(:stage_type)).as(:stage)
  str("(") >> stage >> str(")")
end
|
|
513
|
+
|
|
514
|
+
# New style stage: " ipd", ".ipd" - inline at document end
|
|
515
|
+
# Inline stage introduced by a space or a dot, such as " ipd" or ".ipd".
# Captures :stage containing :stage_id and :stage_type.
rule(:new_stage) do
  separator = space | dot
  stage = (stage_id.as(:stage_id) >> stage_type.as(:stage_type)).as(:stage)
  separator >> stage
end
|
|
518
|
+
|
|
519
|
+
# Publisher
|
|
520
|
+
# Publisher token, "NBS" or "NIST", captured as :publisher.
rule(:publisher) do
  names = %w[NBS NIST]
  names.map { |name| str(name) }.reduce(:|).as(:publisher)
end
|
|
523
|
+
|
|
524
|
+
# Compound series (include publisher in series name) - must be checked FIRST
|
|
525
|
+
# Series names that include the publisher (or are multi-word), captured as
# :series. The list order is the matching order: longer names come before
# names they start with (e.g. "NBS CRPL-F-A" before "NBS CRPL", "NBS CSM"
# before "NBS CS") so the shorter name cannot win first.
rule(:compound_series) do
  names = [
    "NBS BRPD-CRPL-D", "NBS CRPL-F-A", "NBS CRPL-F-B",
    "NBS CS-E", "CSRC Building Block", "CSRC Use Case", "CSRC Book",
    "ITL Bulletin", "NSRDS-NBS",
    # NBS and NIST specific patterns that conflict with simple series
    "NIST LCIRC", "NBS LCIRC", "NIST.LCIRC", "NBS.LCIRC", "NBS RPT",
    "NIST PS", "NIST DCI", "NIST Other",
    "NISTPUB",
    "NBS CSM", "NBS CIRC", "NBS.CRPL", "NBS CRPL", "NBS CS",
    "NBS CIS", "NBS HR", "NBS IRPL", "NBS IP", "NBS PS",
    "NBS BH"
  ]

  names.map { |name| str(name) }.reduce(:|).as(:series)
end
|
|
541
|
+
|
|
542
|
+
# Simple series (no publisher prefix)
|
|
543
|
+
# Series names without a publisher prefix, captured as :series.
#
# Alternatives are an ordered choice, so a name must be listed before any
# shorter name it starts with. "CSM" used to follow "CS", which made "CSM"
# unreachable (the input matched "CS" and left "M" behind); it now precedes
# "CS", as "CSWP" already did.
rule(:simple_series) do
  (
    str("AMS") | str("VTS") |
    str("BSS") | str("BMS") | str("BH") |
    str("FIPS") | str("GCR") | str("HB") | str("MONO") |
    str("MP") | str("NCSTAR") | str("NSRDS") | str("IR") |
    str("SP") | str("TN") | str("CSWP") |
    str("AI") | str("CIRC") | str("CSM") | str("CS") |
    str("CRPL") | str("LCIRC") | str("OWMWP") | str("PC") | str("RPT") |
    str("SIBS") | str("TIBM") | str("TTB") | str("EAB") |
    str("JPCRD") | str("JRES")
  ).as(:series)
end
|
|
556
|
+
|
|
557
|
+
# Suffix letter(s) after number - supports single letters and specific two-letter suffixes
|
|
558
|
+
# Two-letter suffixes: Ur (Unclassified Revised), Ua (Unclassified Amended), Ub-Uj (series variants)
|
|
559
|
+
# Single letter: any letter not followed by excluded keywords
|
|
560
|
+
# Letter suffix following a number. Two forms are accepted:
#   * "U" plus one lowercase letter (e.g. "Ur", "Ua"), or
#   * one letter, optionally followed by digits, provided the text right
#     after that letter does not start with one of the blocked strings below.
rule(:number_suffix) do
  # Text that must NOT follow the candidate letter. Per the original comments,
  # "raft" keeps "draft" out, "r" keeps revision markers (r5, r1963) out, and
  # "p" keeps part markers (28p11969) out.
  blocked_tails = %w[ec ndex nsert rrata raft pp s t hi iet ort r p]
  blocked = blocked_tails.map { |tail| str(tail) }.reduce(:|)

  two_letter = str("U") >> lower_letter
  single_letter = match("[a-zA-Z]") >> blocked.absent? >> digits.maybe

  two_letter | single_letter
end
|
|
579
|
+
|
|
580
|
+
# Digits optionally followed by a number_suffix.
rule(:digits_with_suffix) do
  # The suffix only counts when no digit follows it, so the "e" in "140e2"
  # is not taken as a letter suffix.
  guarded_suffix = number_suffix >> digit.absent?

  digits >> guarded_suffix.maybe
end
|
|
585
|
+
|
|
586
|
+
# Report number, first component.
#
# Supports edition prefixes ("e104"), attached supplements ("144supp") and a
# number of series-specific shapes. Parslet's `|` is an ordered choice: the
# first alternative that matches wins and is never revisited, so specific
# shapes are listed before the generic `digits_with_suffix` fallback and longer
# literals before their prefixes.
rule(:first_number) do
  two_digits = match("[0-9]").repeat(2, 2)

  # Longest numerals first. With "V" ahead of "VI"/"VII"/"VIII" (or "I" ahead
  # of "IX") the short numeral would match, the following dash would not, and
  # the longer numeral would never be tried.
  roman_numeral =
    str("VIII") | str("VII") | str("VI") | str("IV") | str("IX") |
    str("III") | str("II") | str("I") | str("V") | str("X")

  (
    # OWMWP date format: MM-DD-YYYY (e.g., 06-13-2018).
    # Listed first so it is tried before the other dash patterns.
    (two_digits.as(:owmwp_month) >> dash >>
      two_digits.as(:owmwp_day) >> dash >>
      match("[0-9]").repeat(4, 4).as(:owmwp_year)).as(:owmwp_date_number) |
    # Special text patterns (RPT): "ADHOC", "div" + digits
    str("ADHOC") | (str("div") >> digits) |
    # Month ranges for RPT: Apr-Jun1948
    (month_abbrev >> dash >> month_abbrev >> digits) |
    # Number with volume suffix (e.g., "539v10" for CIRC, "1011v1")
    (digits.as(:number) >> str("v") >> digits.as(:volume_suffix)).as(:number_with_volume) |
    # Roman numeral patterns: 1011-I-2.0, 1011-II-1.0 (trailing ".digits" optional)
    (digits >> dash >> roman_numeral >> dash >> digits >> (dot >> digits).maybe) |
    # GB series pattern: 1190GB-1, 1190GB-4A.
    # Listed before the letter-in-middle pattern below.
    (digits >> str("GB") >> dash >> digits >> upper_letter.maybe) |
    # CS series pattern with letter in middle: 102E-42, 123A-50
    (digits >> upper_letter >> dash >> digits) |
    # Volume-number format for CSM series: v6n1, v7n12.
    # Volume and issue number are captured separately.
    (str("v") >> digits.as(:volume_number) >> str("n") >> digits.as(:issue_number)) |
    # Number with supplement and revision suffix: "154supprev"
    (digits >> str("supprev")) |
    # Number with edition and revision year only: "13e2rev1908"
    (digits >> str("e") >> digits >> str("rev") >> digits) |
    # Number with revision year ("rv", LetterCircular): "1013rv1953"
    (digits.as(:number) >> str("rv") >> digits.as(:revision_year)).as(:number_with_rev_year) |
    # Number with edition, revision and month-date: "13e2revJune1908"
    (digits >> str("e") >> digits >> str("rev") >> month_abbrev >> digits) |
    # Number with eN suffix and supplement (e.g., "101e2supp"), optional digits after
    (digits >> str("e") >> digits >> str("supp") >> digits.maybe) |
    # Edition prefix with revision and date: e2revJune1908
    (str("e") >> digits >> str("rev") >> month_abbrev >> digits) |
    # Edition prefix followed by digits and supplement with optional digits
    (str("e") >> digits >> str("supp") >> digits.maybe) |
    # Number with eN suffix (e.g., "101e2")
    (digits >> str("e") >> digits) |
    # Bare edition ("e2" without number prefix), not followed by dash + digits
    (str("e") >> digits >> (dash >> digits).absent?) |
    # Letter prefix with digits (e.g., "c4" for CRPL)
    (lower_letter >> digits) |
    # Number with supplement and month/year (e.g., "24suppJan1924")
    (digits >> str("supp") >> month_abbrev >> digits) |
    # Number with supplement suffix (e.g., "144supp"), optional digits after
    (digits >> str("supp") >> digits.maybe) |
    # Number with "sup" followed by month/year
    (digits >> str("sup") >> month_abbrev >> digits) |
    # Number with bare "sup" suffix (e.g., "9350sup", RPT)
    (digits >> str("sup")) |
    # Attached two-letter language code (e.g., "1088sp"), either case.
    # Listed after the sup/supp patterns and before the generic suffix
    # fallback. The look-aheads require that no letter, digit, dash or dot
    # follows the code.
    (digits.as(:number) >> (str("sp") | str("pt") | str("es") | str("SP") | str("PT") | str("ES")).as(:language_code) >> (upper_letter.absent? >> digit.absent? >> letter.absent? >> dash.absent? >> dot.absent?)) |
    # Part + edition year attached (MR format): "28pt1e1969",
    # e.g. "NBS.HB.28pt1e1969". Although listed after the language-code
    # pattern, it is still reached for such input because that pattern
    # rejects "pt" followed by a digit.
    (digits.as(:number) >> str("pt") >> digits.as(:part_number) >> str("e") >> digits.as(:edition_year)) |
    # Parenthetical language code (e.g., "378(sp)"); only uppercase codes are
    # listed here (original note: preprocessing upper-cases the content of
    # parentheses).
    (digits.as(:number) >> str("(") >> (str("SP") | str("PT") | str("ES")).as(:language_code) >> str(")")) |
    # Fallback: plain number with optional letter suffix (e.g., "123A")
    digits_with_suffix
  ).as(:first_number)
end
|
|
657
|
+
|
|
658
|
+
# Second number (the component after the separator): allows pt suffixes,
# letter suffixes, attached revisions and CRPL shapes.
#
# Two negative look-aheads run first: the token may not start with a month
# abbreviation (so "-Feb1985" is left for the edition patterns) nor with the
# keyword "draft". The alternatives are an ordered choice; the first match wins.
rule(:second_number) do
  leading_guards = month_abbrev.absent? >> str("draft").absent?

  token =
    # Trailing bare supplement marker on a compound second number
    # (e.g. "800-53sup"), so it is not split into "53s" + "up". Original
    # note: the builder strips the marker and sets supplement="".
    (digits >> (str("supp") | str("sup")) >>
      (digit.absent? >> letter.absent?)) |
    # Revision followed by a U+letter suffix (e.g., "22r1Ua").
    # Listed before the generic letter forms so "Ua" is not cut down to "U".
    (digits >> str("r") >> digits >> str("U") >> lower_letter) |
    # Revision followed by a single letter (e.g., "22r1a", "22r1A"), taken as
    # one unit. Listed before plain r+digits, which would stop at "22r1".
    (digits >> str("r") >> digits >> match("[a-zA-Z]")) |
    # Revision attached to the number (e.g., "126r2013")
    (digits.as(:number_only) >> str("r") >> digits.as(:edition_id)) |
    # CRPL range with underscore (e.g., "2_3-1A")
    (digits >> str("_") >> digits >> dash >> digits >> upper_letter.maybe) |
    # Lowercase letter, dash, digits (e.g., "m-5")
    (lower_letter >> dash >> digits) |
    # Number with pt suffix (e.g., "57pt1"). The look-ahead rejects the match
    # when a dash follows (pt#-# shapes such as "pt3-1" are CRPL parts).
    (digits >> str("pt") >> digits >> dash.absent?) |
    # Number with one uppercase letter (e.g., "56A", "123B"; cf. "56Ar2")
    (digits >> upper_letter) |
    # Number + "r" + letter (e.g., "27ra", as in NIST SP 800-27ra)
    (digits.as(:number_only) >> str("r") >> match("[a-zA-Z]").as(:letter)).as(:revision_letter) |
    # "r" + letter without leading digits (e.g., "rA")
    (str("r") >> match("[a-zA-Z]")).as(:revision_letter_suffix) |
    # "r" + digits (e.g., "r1", "r2") as a trailing revision
    (str("r") >> digits.as(:edition_id)).as(:revision_simple) |
    # Fixed keywords
    str("NCNR") | str("PERMIS") | str("BFRL") |
    # One to three capital letters on their own (e.g., "A", "B", "C")
    upper_letter.repeat(1, 3) |
    # Plain number with optional suffix, unless a FIPS date tail
    # (dash, month, digits, slash) follows
    (digits_with_suffix >> (dash >> month_abbrev >> digits >> slash).absent?) |
    # Single lowercase letter not followed by a digit (matches the "a" of
    # "126a" but not the "r" of "126r2")
    (lower_letter >> digit.absent?)

  leading_guards >> token.as(:second_number)
end
|
|
709
|
+
|
|
710
|
+
# Edition component (original note: per NIST spec, <edition-type><edition-id>).
#
# Types handled: "e" (edition), "r" (revision), "rev" (verbose revision) and
# "-" (historical / dash-year). Original examples: e2, e2021, r5, rev2013,
# rev 2013, -3. Variants with a leading space exist because preprocessing may
# separate the marker from the number. Each variant is tagged with its own
# capture name so later stages can tell the formats apart.
rule(:edition) do
  id = digits.as(:edition_id)
  trailing_letter = match("[a-zA-Z]").as(:edition_letter)

  # "e" + digits, optional leading space: e2, e3, e2021
  e_form = (space.maybe >> str("e") >> id).as(:edition_e)
  # Leading space, "r" + digits + letter
  spaced_r_letter = (space >> str("r") >> id >> trailing_letter).as(:edition_r_with_space_letter)
  # Leading space, "r" + digits
  spaced_r = (space >> str("r") >> id).as(:edition_r_with_space)
  # Compact "r" + digits + letter: r5A
  compact_r_letter = (str("r") >> id >> trailing_letter).as(:edition_r_no_space_letter)
  # Compact "r" + digits: r5
  compact_r = (str("r") >> id).as(:edition_r_no_space)
  # Verbose "rev" with optional spaces around it: rev2013, rev 2013
  rev_form = (space.maybe >> str("rev") >> space.maybe >> id).as(:edition_rev)
  # Historical: dash + a single digit 1-9 that is not followed by another
  # digit, so date-like input such as "-1908" is not consumed here.
  historical = (dash >> match("[1-9]").as(:edition_id) >> digit.absent?).as(:edition_historical)
  # Dash + exactly four digits (e.g. "-1979" as in "NBS HB 130-1979")
  dash_year = (dash >> match("[0-9]").repeat(4, 4).as(:dash_year)).as(:edition_dash_year)

  e_form |
    spaced_r_letter |
    spaced_r |
    compact_r_letter |
    compact_r |
    rev_form |
    historical |
    dash_year
end
|
|
737
|
+
|
|
738
|
+
# Date component (original note: per NIST spec, -{YYYY}, -{YYYYMM} or
# -{YYYYMMDD}; separate from Edition, both can coexist).
# Examples: -1908, -190806, -19770930, and the legacy -June1908 / -Jan1925.
#
# NOTE(review): rule(:parts) lists `edition` before `date`, and `edition` has
# its own dash + four-digit alternative (:edition_dash_year). Confirm that
# numeric dates reach this rule as intended.
rule(:date) do
  year = match("[0-9]").repeat(4, 4).as(:date_year)
  month = match("[0-9]").repeat(2, 2).as(:date_month)
  day = match("[0-9]").repeat(2, 2).as(:date_day)

  # Longest numeric form first so YYYYMMDD is not cut short at YYYY.
  full_date = dash >> year >> month >> day
  year_month = dash >> year >> month
  year_only = dash >> year
  # Legacy form: month abbreviation followed by digits
  legacy_month_year = dash >> month_abbrev.as(:date_month) >> digits.as(:date_year)

  (full_date | year_month | year_only | legacy_month_year).as(:date)
end
|
|
756
|
+
|
|
757
|
+
# Legacy edition patterns, kept for backward compatibility (original note:
# to be replaced gradually by the Edition/Date components).
rule(:legacy_edition) do
  month_word = match("[A-Za-z]").repeat(3, 9).as(:edition_month)
  year_digits = digits.as(:edition_year)

  # "r" or " R", one or two digits, then a lowercase letter: r1a, r2b
  revision_with_letter =
    (str("r") | str(" R")) >> match("[0-9]").repeat(1, 2).as(:edition) >>
    lower_letter.as(:edition_letter)

  # "rev" + digits with an optional space: rev2013, rev 2013
  rev_year = str("rev") >> space.maybe >> year_digits

  # "e" or " E", one to three digits, "rev", month word, digits: e2revJune1908
  edition_rev_month =
    (str("e") | str(" E")) >> match("[0-9]").repeat(1, 3).as(:edition) >>
    str("rev") >> month_word >> year_digits

  # "e" + four digits + optional two digits: e2018, e201801
  edition_year_month =
    str("e") >> match("[0-9]").repeat(4, 4).as(:edition_year) >>
    match("[0-9]").repeat(2, 2).as(:edition_month).maybe

  # "rev" + month word + digits: revJune1908, revJan1925
  rev_month_year = str("rev") >> month_word >> year_digits

  revision_with_letter |
    rev_year |
    edition_rev_month |
    edition_year_month |
    rev_month_year
end
|
|
778
|
+
|
|
779
|
+
# CRPL range: digits "_" digits "-" digits, with an optional trailing
# uppercase letter (e.g. the "2_3-1" / "2_3-1A" part of "1-2_3-1A").
rule(:crpl_range) do
  range_body = digits >> str("_") >> digits >> dash >> digits >> upper_letter.maybe

  range_body.as(:crpl_range)
end
|
|
783
|
+
|
|
784
|
+
# Full report number: first_number followed by at most one optional tail.
#
# The tail alternatives are an ordered choice, so their order below is
# significant. Shapes covered (from the original notes): month-year editions
# (107-Mar1985), FIPS day/slash dates (11-1-Sep30/1977), decimals (80-2073.3),
# letter suffixes (1-1A, 73-197Ur), dash-year editions (130-1979, 105-1-1990),
# GCR multi-dash numbers (85-3273-37), dot-separated parts (984.4), CRPL
# ranges and attached revisions (800-53r5).
rule(:report_number) do
  four_digits = match("[0-9]").repeat(4, 4).as(:dash_year)

  # -MonthYear. First, so these are not picked up by a later alternative.
  month_year_edition = dash >> month_abbrev.as(:edition_month) >> digits.as(:edition_year)

  # -part-MonthDay/Year (FIPS). Ahead of the multi-dash pattern.
  fips_day_slash_year =
    dash >> digits.as(:fips_part) >> dash >> month_abbrev.as(:edition_month) >>
    digits.as(:edition_day) >> slash >> digits.as(:edition_year)

  # -digits.digits (e.g., 80-2073.3). Ahead of the multi-dash pattern.
  decimal_tail =
    (dash >> digits.as(:decimal_base) >> dot >> digits.as(:decimal_suffix)).as(:decimal_number)

  # -digits + letter suffix: either "U" + lowercase letter (Ur, Ua, ...) or a
  # single uppercase letter (A, B, C).
  letter_marker =
    ((str("U") >> lower_letter.as(:letter_suffix_extra)) | upper_letter).as(:letter_suffix)
  lettered_tail = (dash >> digits.as(:letter_base) >> letter_marker).as(:letter_number)

  # -YYYY with nothing further to parse after it. Original note: any four
  # digits match; the builder decides whether it is a year or a second number.
  bare_dash_year =
    (dash >> four_digits >>
      (space | dot | part | crpl_range | second_number | dash).absent?).as(:edition_dash_year)

  # -second_number-YYYY (e.g., "105-1-1990"). Ahead of the multi-dash pattern,
  # which starts the same way.
  second_then_year =
    (dash >> second_number >> dash >> four_digits >>
      (space | dot | part | crpl_range | revision | draft).absent?).as(:second_number_edition_year)

  # -second_number-MonthYear (e.g., "11-1-Sep1977")
  second_then_month_year =
    (dash >> second_number >> dash >> month_abbrev.as(:edition_month) >> digits.as(:edition_year) >>
      (space | dot | part | crpl_range | edition | revision | draft).absent?).as(:fips_month_year_after_part)

  # GCR multi-dash (e.g., 85-3273-37, 19-200-30B)
  gcr_multi_dash = dash >> second_number >> dash >> (digits >> upper_letter.maybe).as(:part_number)

  # .second_number (e.g., 984.4 = number 984, part 4)
  dotted_second = dot >> second_number

  # -second_number or -crpl_range with optional edition (e.g., 800-53r5)
  dashed_second = dash >> (crpl_range | second_number) >> edition.maybe

  # -edition
  dashed_edition = dash >> edition

  # TODO (carried over, still disabled): language code suffix without
  # separator (e.g., "1088sp"); must come AFTER other patterns to avoid
  # consuming them:
  # | (str("sp") | str("pt") | str("es") | match("[a-z]").repeat(2, 4)).as(:language_code) >> (space | dot | part | crpl_range | second_number).absent?) |
  # Parenthetical language code (e.g., "378(sp)"):
  # | (str("(") >> match("[a-z]").repeat(2, 4).as(:language_code) >> str(")") >> (space | dot | part | crpl_range | second_number).absent?)
  tail =
    month_year_edition |
    fips_day_slash_year |
    decimal_tail |
    lettered_tail |
    bare_dash_year |
    second_then_year |
    second_then_month_year |
    gcr_multi_dash |
    dotted_second |
    dashed_second |
    dashed_edition

  first_number >> tail.maybe
end
|
|
837
|
+
|
|
838
|
+
# Volume: a marker ("v", optionally preceded by a space, or the verbose
# " Vol. ") followed by the volume designation, captured as :volume.
rule(:volume) do
  # The verbose marker is tried first, from the original position. In the
  # previous form, `space.maybe >> (str("v") | str(" Vol. "))`, the optional
  # space was consumed before " Vol. " was attempted, so a single-spaced
  # " Vol. 3" could not match (assuming `space` matches a blank -- it is
  # defined outside this rule). The original expression is kept as the second
  # alternative so everything that matched before still matches.
  marker = str(" Vol. ") | (space.maybe >> (str("v") | str(" Vol. ")))

  marker >>
    (digits >>
      # Letter ranges (original note: lowercase normalized in preprocessing)
      (str("a-l") | str("m-z") | str("A-L") | str("M-Z")).maybe >>
      # Up to two uppercase letters (e.g., v3B, v1A)
      upper_letter.repeat(0, 2)).as(:volume)
end
|
|
847
|
+
|
|
848
|
+
# Part: a marker ("pt", "p" or "P", optionally preceded by a space, or the
# verbose " Part ") followed by the part designation, captured as :part.
# Supports shapes such as p1adde1 and pt3r1 (part with revision).
rule(:part) do
  # " Part " is tried first. Parslet's `|` is an ordered choice without
  # backtracking into an alternative that already matched: previously the
  # short form matched the " P" of " Part ", the digits that must follow then
  # failed on "art", and the verbose alternative was never reached. Input
  # starting with " Part " could not match before, so nothing that parsed
  # earlier changes.
  marker = str(" Part ") | (space.maybe >> (str("pt") | str("p") | str("P")))

  marker >>
    (digits >>
      # Revision after the part number: pt3r1, p1r1 (optional space before "r")
      (space.maybe >> str("r") >> digits).maybe >>
      # Addendum with optional edition: add, adde1
      (str("add") >> (str("e") >> digits).maybe).maybe >>
      # Optional dash + digits
      (dash >> digits).maybe).as(:part)
end
|
|
858
|
+
|
|
859
|
+
# Revision component. Ordered choice, longest / most specific shapes first.
rule(:revision) do
  # "rev" must be tried before "r": with `str("r") | str("rev")` the "r"
  # branch always matched first on "rev...", the remainder ("ev...") then
  # failed, and the "rev" branch was never attempted.
  rev_or_r = str("rev") | str("r")

  # Revision with month and year: rJun1992, revJun1992 (optional spaces
  # before the marker and before the month)
  (space.maybe >> rev_or_r >> space.maybe >> month_abbrev.as(:revision_month) >> digits.as(:revision_year)) |
  # Revision with slash and year: r6/1925, r11/1924 (LCIRC patterns)
  (space.maybe >> rev_or_r >> digits.as(:revision) >>
    slash >> digits.as(:revision_year)) |
  # "r" / " r" with a four-digit year: r1995, r 1995
  ((str(" r") | str("r")) >> space.maybe >> match("[0-9]").repeat(4, 4).as(:revision_year)) |
  # "rev" with digits: rev2013, rev 2013
  (str("rev") >> space.maybe >> digits.as(:revision_year)) |
  # Prefix + id made of digits and/or letters: r1a, r1A, ra, r1.
  # The matched prefix is captured so the original format (e.g., " Rev. 5")
  # can be preserved.
  ((str(" rev ") | str("rev") | str(" r") | str("r") | str(" Rev. ") | str(" Revision (r)")).as(:revision_prefix) >>
    ((digits >> match("[a-zA-Z]").maybe) | match("[a-zA-Z]").repeat(1)).as(:revision_id)).as(:revision) |
  # Standalone " r" at the very end of the input (e.g. "800-56a r").
  # Kept last so it does not take input from the patterns above.
  (str(" r") >> any.absent?).as(:revision_standalone)
end
|
|
882
|
+
|
|
883
|
+
# Version (original note: V1 SP parser compatible).
# Supports: ver1.0.2, ver2, " Ver. 2.0", " Version 1.0", v1.0.2, -v1.0 and
# .ver1.0 (MR format, e.g. "500-281.ver1.0").
rule(:version) do
  # digits with any number of ".digits" groups (dots optional)
  loose_number = (digits >> (dot >> digits).repeat).as(:version)
  # digits.digits with one optional further ".digits" (dots required)
  dotted_number = (digits >> dot >> digits >> (dot >> digits).maybe).as(:version)

  # "ver", optionally preceded by a space or dot and followed by a space
  ((space | dot).maybe >> str("ver") >> space.maybe >> loose_number) |
  # Verbose " Ver. " / " Version " followed by a dotted number
  ((str(" Ver. ") | str(" Version ")) >> dotted_number) |
  # Short "v" with a dotted number, optionally preceded by a dash or space
  ((dash | space).maybe >> str("v") >> dotted_number)
end
|
|
895
|
+
|
|
896
|
+
# Update (original note: V1 compatible).
# Format: /Upd{N}-{YYYY}{MM}; the number, the date and the month are each
# optional. Examples: /Upd1-2015, /Upd3-202102, -upd, /upd
# ("500-300-upd" has no number).
rule(:update) do
  # The matched marker is captured so the original spelling (-upd vs /Upd)
  # can be preserved.
  marker = (
    str("/Upd") |
    (space.maybe >> (str("/upd") | str("-upd")))
  ).as(:update_prefix)

  # -YYYY with an optional two-digit month
  dated =
    dash >>
    match("[0-9]").repeat(4, 4).as(:update_year) >>
    match("[0-9]").repeat(2, 2).as(:update_month).maybe

  marker >> (digits.as(:update_number).maybe >> dated.maybe).as(:update)
end
|
|
916
|
+
|
|
917
|
+
# Addendum: "-add", ".add" or " Add.", an optional space or dash, then an
# optional number (an empty string is captured when no digits follow).
rule(:addendum) do
  marker = str("-add") | str(".add") | str(" Add.")
  number = (digits | str("")).as(:addendum_number)

  (marker >> (space | dash).maybe >> number).as(:addendum)
end
|
|
922
|
+
|
|
923
|
+
# Supplement: "supp" or "sup" (optional leading space) with optional detail.
# Examples: suppJan1924, supp3/1926, supp1925, supJun1925-Jun1927, supprev.
rule(:supplement) do
  keyword = str("supp") | str("sup")

  # Supplement followed by revision: supprev
  with_revision = str("rev").as(:supplement_with_rev)
  # Date range: Jan1924-Jan1926
  date_range =
    (month_abbrev.as(:supp_month_start) >> digits.as(:supp_year_start) >>
      dash >> month_abbrev.as(:supp_month_end) >> digits.as(:supp_year_end)).as(:supplement_date_range)
  # Month and year: Jan1924
  month_year = (month_abbrev.as(:supp_month) >> digits.as(:supp_year)).as(:supplement_date)
  # Number, slash, year: 3/1926
  number_slash_year = (digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_slash_year)
  # Digits only: 1925
  year_only = digits.as(:supp_year)
  # Anything else alphanumeric
  free_suffix = match("[A-Za-z0-9]").repeat(1).as(:supplement_suffix)

  # Ordered choice: the range is tried before the single date, and the
  # slash form before the bare digits.
  detail =
    with_revision |
    date_range |
    month_year |
    number_slash_year |
    year_only |
    free_suffix

  space.maybe >> keyword >> detail.maybe
end
|
|
944
|
+
|
|
945
|
+
# Errata: "errata" or "err", optionally preceded by a dash.
rule(:errata) do
  # Longer keyword first so "errata" is not cut down to "err".
  keyword = str("errata") | str("err")

  (dash.maybe >> keyword).as(:errata)
end
|
|
949
|
+
|
|
950
|
+
# Index marker: "index" or "indx".
rule(:index) do
  keyword = str("index") | str("indx")

  keyword.as(:index)
end
|
|
954
|
+
|
|
955
|
+
# Insert marker: "insert" or "ins".
rule(:insert) do
  # Longer keyword first so "insert" is not cut down to "ins".
  keyword = str("insert") | str("ins")

  keyword.as(:insert)
end
|
|
959
|
+
|
|
960
|
+
# Appendix marker: the literal "app".
rule(:appendix) do
  keyword = str("app")

  keyword.as(:appendix)
end
|
|
964
|
+
|
|
965
|
+
# Section: "sec" with an optional number, so a bare "sec" also matches.
# Only the digits are captured (as :section).
rule(:section) do
  optional_number = digits.as(:section).maybe

  str("sec") >> optional_number
end
|
|
969
|
+
|
|
970
|
+
# Translation: a three-character language code (original note: V1 compatible).
# Supports "(spa)", " spa" and ".spa" (MR format).
rule(:translation) do
  # Exactly three word characters, captured as :translation
  code = match('\w').repeat(3, 3).as(:translation)

  # Parenthetical: (spa), (por), (ind)
  parenthesized = str("(") >> code >> str(")")
  # Space prefix: " spa"
  space_prefixed = space >> code
  # Dot prefix: ".spa" (machine-readable)
  dot_prefixed = dot >> code

  parenthesized | space_prefixed | dot_prefixed
end
|
|
980
|
+
|
|
981
|
+
# Public draft suffix: a space, digits, then "pd" (e.g. " 2pd", " 3pd").
rule(:pd_suffix) do
  marker = space >> digits >> str("pd")

  marker.as(:public_draft)
end
|
|
985
|
+
|
|
986
|
+
# Draft stage. Three spellings, captured as :draft:
#   " (Draft)"                       - parenthetical
#   "-draft", " -draft 2", "-draft2" - dashed keyword with optional number
#   pd_suffix                        - public draft suffix
rule(:draft) do
  parenthetical = space >> str("(Draft)")

  # Number after "draft", with or without a separating space
  draft_number = (space >> digits) | digits
  # The space before the dash is optional so this also matches right after
  # a report number.
  dashed_keyword = space.maybe >> dash >> str("draft") >> draft_number.maybe

  (parenthetical | dashed_keyword | pd_suffix).as(:draft)
end
|
|
993
|
+
|
|
994
|
+
# FIPS date with slash, treated as part of the number rather than an edition:
# "-" part "-" month day "/" year.
rule(:fips_date) do
  part_field = digits.as(:fips_part)
  month_field = month_abbrev.as(:fips_month)
  day_field = digits.as(:fips_day)
  year_field = digits.as(:fips_year)

  dash >> part_field >> dash >> month_field >> day_field >> slash >> year_field
end
|
|
999
|
+
|
|
1000
|
+
# Every component that may follow the report number.
#
# The list is folded into one ordered choice (a | b | c | ...), so position
# in the array is priority: earlier entries win over later ones.
rule(:parts) do
  [
    # new_stage ahead of language_code so "ipd" is not read as a translation
    new_stage,
    section, index, insert, appendix, pd_suffix,
    edition, date, legacy_edition, revision,
    # version ahead of volume: dotted versions (v1.1) before plain volumes (v1)
    version,
    volume, part, update, addendum,
    supplement, errata, language_code
  ].reduce(:|)
end
|
|
1011
|
+
|
|
1012
|
+
# CIRC Supplement identifier (implemented by rule(:circ_supplement_identifier) further below) - split into base + supplement
|
|
1013
|
+
# Examples:
|
|
1014
|
+
# - "NBS CIRC 101e2supp" → base="NBS CIRC 101e2", supplement
|
|
1015
|
+
# - "NBS CIRC 25supp-1924" → base="NBS CIRC 25", supplement_year="1924"
|
|
1016
|
+
# - "NBS CIRC 24suppJan1924" → base="NBS CIRC 24", supplement_edition="Jan1924"
|
|
1017
|
+
# - "NBS CIRC suppJun1925-Jun1926" → date range supplement (no base)
|
|
1018
|
+
# - "NBS LCIRC 378Gsup" → base="NBS LCIRC 378G", supplement (no metadata)
|
|
1019
|
+
# - "NBS.LCIRC.378sup1/1927" → dot-separated MR format (after preprocessing)
|
|
1020
|
+
# Dot-separated machine-readable (MR) identifier, e.g. NIST.SP.800-116 or
# #NIST.2024-01-15.123. Also accepts parts after the number
# (NIST.SP.1011-I-2.0) and revision + update (NIST.IR.8115r1-upd).
rule(:mr_identifier) do
  # [#]publisher.series.number -- both dots are mandatory
  head = hash_prefix.maybe >> publisher >> dot >> simple_series >> dot >> report_number

  # Edition year attached with an underscore (MR format: 1648_2009)
  underscore_year = (str("_") >> digits.as(:edition_year)).maybe
  # Up to three extra dot-separated fields, each digits or one uppercase letter
  extra_dot_fields = (dot >> (digits | upper_letter)).repeat(0, 3)

  head >>
    underscore_year >>
    # Letter suffix ahead of an update (e.g., 8286C-upd1)
    upper_letter.maybe >>
    # Revision (r1, r5, ...) ahead of an update
    edition.maybe >>
    update.maybe >>
    extra_dot_fields >>
    # Remaining components, then an optional draft marker
    parts.repeat >>
    draft.maybe
end
|
|
1041
|
+
|
|
1042
|
+
# Main identifier structure. Ordered choice, tried in this order:
#   1. circ_supplement_identifier
#   2. mr_identifier
#   3. compound series (publisher is part of the series name)
#   4. publisher + simple series
#   5. simple series alone
rule(:identifier) do
  separator = space | dot

  # Appends the components shared by forms 3-5 to the given head. A fresh
  # chain is built on each call, which keeps every form one flat sequence.
  with_tail = lambda do |head|
    head >> report_number.maybe >> fips_date.maybe >> parts.repeat >>
      draft.maybe >> translation.maybe >> new_stage.maybe
  end

  # Compound series, separator, optional old-style stage
  compound_form = with_tail.call(compound_series >> separator >> old_stage.maybe)

  # Publisher, separator, simple series, optional old-style stage, separator
  publisher_form = with_tail.call(
    publisher >> separator >> simple_series >> old_stage.maybe >> separator
  )

  # Simple series only, optional old-style stage, separator
  series_only_form = with_tail.call(simple_series >> old_stage.maybe >> separator)

  circ_supplement_identifier |
    mr_identifier |
    compound_form |
    publisher_form |
    series_only_form
end
|
|
1069
|
+
|
|
1070
|
+
# CIRC / LCIRC supplement identifier, split into a base and a supplement.
# Accepts the series "NBS CIRC", "NBS LCIRC", "NBS.CIRC" or "NBS.LCIRC",
# then a space or dot, then either a date-range supplement with no base
# number or a base number followed by a supplement.
rule(:circ_supplement_identifier) do
  series_head = (
    (str("NBS CIRC") | str("NBS LCIRC") | str("NBS.CIRC") | str("NBS.LCIRC")).as(:series) >>
    (space | dot)
  ).as(:circ_series)

  # Date range supplement without a base number: suppJun1925-Jun1926
  range_only =
    (str("supp") >> month_abbrev.as(:supp_month_start) >> digits.as(:supp_year_start) >>
      dash >> month_abbrev.as(:supp_month_end) >> digits.as(:supp_year_end)).as(:supplement_date_range)

  # Base portion: everything before "supp" / "sup" or the slash + year
  base = (
    # Number with edition: "101e2"
    (digits.as(:base_number) >> str("e") >> digits.as(:edition_number)) |
    # Number, lowercase letter, number (revision form): "145r11"
    (digits.as(:base_number) >> lower_letter.as(:revision_letter) >> digits.as(:revision_number)) |
    # Number with uppercase letter suffix: "378G"
    (digits.as(:base_number) >> upper_letter.as(:letter_suffix)) |
    # Just a number: "25", "24"
    digits.as(:simple_number)
  ).as(:base_portion)

  # Optional detail after an explicit "supp" / "sup" marker
  explicit_detail =
    (month_abbrev >> digits).as(:supplement_month_year) |
    # Dash + number + slash + year (e.g., supp-12/1926)
    (dash >> digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_dash_slash_year) |
    # Dash + digits (e.g., supp-1924)
    (dash >> digits.as(:supplement_year)) |
    # Number + slash + year (e.g., sup1/1927)
    (digits.as(:supp_number) >> slash >> digits.as(:supp_year)).as(:supplement_slash_year) |
    # No detail at all
    str("").as(:supplement_empty)

  explicit_supplement = (str("supp") | str("sup")) >> explicit_detail.maybe

  # Implicit supplement: slash + year with no marker (e.g., "145r11/1925")
  implicit_supplement = (slash >> digits.as(:implicit_supplement_year)).as(:implicit_supplement)

  based_supplement = base >> (explicit_supplement | implicit_supplement)

  series_head >> (range_only | based_supplement)
end
|
|
1113
|
+
|
|
1114
|
+
root(:identifier)
|
|
1115
|
+
end
|
|
1116
|
+
end
|
|
1117
|
+
end
|