atlas_engine 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +123 -0
- data/Rakefile +20 -0
- data/app/assets/config/atlas_engine_manifest.js +3 -0
- data/app/assets/stylesheets/atlas_engine/application.css +15 -0
- data/app/concerns/atlas_engine/handles_blob.rb +26 -0
- data/app/concerns/atlas_engine/handles_interruption.rb +22 -0
- data/app/controllers/atlas_engine/application_controller.rb +7 -0
- data/app/controllers/atlas_engine/connectivity_controller.rb +21 -0
- data/app/controllers/atlas_engine/country_imports_controller.rb +73 -0
- data/app/controllers/atlas_engine/graphql_controller.rb +59 -0
- data/app/countries/atlas_engine/ar/country_profile.yml +9 -0
- data/app/countries/atlas_engine/at/address_importer/corrections/open_address/city_corrector.rb +23 -0
- data/app/countries/atlas_engine/at/country_profile.yml +24 -0
- data/app/countries/atlas_engine/at/index_configuration.yml +63 -0
- data/app/countries/atlas_engine/at/synonyms.yml +6 -0
- data/app/countries/atlas_engine/at/validation_transcriber/address_parser.rb +58 -0
- data/app/countries/atlas_engine/au/address_importer/open_address/filter.rb +26 -0
- data/app/countries/atlas_engine/au/address_importer/open_address/mapper.rb +41 -0
- data/app/countries/atlas_engine/au/country_profile.yml +13 -0
- data/app/countries/atlas_engine/au/synonyms.yml +209 -0
- data/app/countries/atlas_engine/au/validation_transcriber/address_parser.rb +121 -0
- data/app/countries/atlas_engine/be/country_profile.yml +12 -0
- data/app/countries/atlas_engine/bm/address_importer/corrections/open_address/city_alias_corrector.rb +38 -0
- data/app/countries/atlas_engine/bm/address_importer/open_address/mapper.rb +40 -0
- data/app/countries/atlas_engine/bm/country_profile.yml +12 -0
- data/app/countries/atlas_engine/br/country_profile.yml +4 -0
- data/app/countries/atlas_engine/ca/country_profile.yml +7 -0
- data/app/countries/atlas_engine/ca/synonyms.yml +1615 -0
- data/app/countries/atlas_engine/ch/address_importer/corrections/open_address/city_corrector.rb +29 -0
- data/app/countries/atlas_engine/ch/address_importer/corrections/open_address/locale_corrector.rb +74 -0
- data/app/countries/atlas_engine/ch/address_importer/open_address/mapper.rb +40 -0
- data/app/countries/atlas_engine/ch/country_profile.yml +15 -0
- data/app/countries/atlas_engine/ch/locales/de/country_profile.yml +15 -0
- data/app/countries/atlas_engine/ch/locales/de/index_configuration.yml +63 -0
- data/app/countries/atlas_engine/ch/locales/de/synonyms.yml +7 -0
- data/app/countries/atlas_engine/ch/locales/fr/synonyms.yml +21 -0
- data/app/countries/atlas_engine/cz/country_profile.yml +6 -0
- data/app/countries/atlas_engine/de/country_profile.yml +19 -0
- data/app/countries/atlas_engine/de/index_configuration.yml +64 -0
- data/app/countries/atlas_engine/de/synonyms.yml +2 -0
- data/app/countries/atlas_engine/de/validation_transcriber/address_parser.rb +19 -0
- data/app/countries/atlas_engine/dk/country_profile.yml +6 -0
- data/app/countries/atlas_engine/dk/synonyms.yml +3 -0
- data/app/countries/atlas_engine/dk/validation_transcriber/address_parser.rb +21 -0
- data/app/countries/atlas_engine/fo/country_profile.yml +5 -0
- data/app/countries/atlas_engine/fr/address_importer/corrections/open_address/city_corrector.rb +28 -0
- data/app/countries/atlas_engine/fr/country_profile.yml +13 -0
- data/app/countries/atlas_engine/fr/synonyms.yml +21 -0
- data/app/countries/atlas_engine/fr/validation_transcriber/address_parser.rb +34 -0
- data/app/countries/atlas_engine/gb/address_validation/es/query_builder.rb +98 -0
- data/app/countries/atlas_engine/gb/country_profile.yml +10 -0
- data/app/countries/atlas_engine/gb/validation_transcriber/full_address_parser.rb +164 -0
- data/app/countries/atlas_engine/gb/validation_transcriber/parsed_address.rb +120 -0
- data/app/countries/atlas_engine/gg/address_validation/validators/full_address/restrictions/unsupported_city.rb +39 -0
- data/app/countries/atlas_engine/gg/country_profile.yml +7 -0
- data/app/countries/atlas_engine/ie/country_profile.yml +3 -0
- data/app/countries/atlas_engine/it/address_importer/corrections/open_address/city_corrector.rb +27 -0
- data/app/countries/atlas_engine/it/address_importer/corrections/open_address/province_corrector.rb +29 -0
- data/app/countries/atlas_engine/it/address_importer/open_address/mapper.rb +42 -0
- data/app/countries/atlas_engine/it/country_profile.yml +11 -0
- data/app/countries/atlas_engine/jp/address_validation/es/data_mapper.rb +63 -0
- data/app/countries/atlas_engine/jp/country_profile.yml +6 -0
- data/app/countries/atlas_engine/kr/address_importer/open_address/mapper.rb +41 -0
- data/app/countries/atlas_engine/kr/country_profile.yml +11 -0
- data/app/countries/atlas_engine/li/address_importer/corrections/open_address/city_corrector.rb +25 -0
- data/app/countries/atlas_engine/li/country_profile.yml +21 -0
- data/app/countries/atlas_engine/li/index_configuration.yml +63 -0
- data/app/countries/atlas_engine/li/synonyms.yml +6 -0
- data/app/countries/atlas_engine/lt/country_profile.yml +6 -0
- data/app/countries/atlas_engine/lt/synonyms.yml +7 -0
- data/app/countries/atlas_engine/lt/validation_transcriber/address_parser.rb +24 -0
- data/app/countries/atlas_engine/lu/address_importer/corrections/open_address/locale_corrector.rb +54 -0
- data/app/countries/atlas_engine/lu/country_profile.yml +12 -0
- data/app/countries/atlas_engine/nl/address_importer/corrections/open_address/city_corrector.rb +25 -0
- data/app/countries/atlas_engine/nl/country_profile.yml +18 -0
- data/app/countries/atlas_engine/nl/index_configuration.yml +52 -0
- data/app/countries/atlas_engine/nl/synonyms.yml +92 -0
- data/app/countries/atlas_engine/nl/validation_transcriber/address_parser.rb +85 -0
- data/app/countries/atlas_engine/no/country_profile.yml +5 -0
- data/app/countries/atlas_engine/nz/country_profile.yml +3 -0
- data/app/countries/atlas_engine/pl/country_profile.yml +5 -0
- data/app/countries/atlas_engine/pl/validation_transcriber/address_parser.rb +19 -0
- data/app/countries/atlas_engine/pt/address_importer/corrections/open_address/city_corrector.rb +32 -0
- data/app/countries/atlas_engine/pt/address_importer/open_address/mapper.rb +39 -0
- data/app/countries/atlas_engine/pt/country_profile.yml +10 -0
- data/app/countries/atlas_engine/pt/synonyms.yml +7 -0
- data/app/countries/atlas_engine/sa/country_profile.yml +10 -0
- data/app/countries/atlas_engine/se/country_profile.yml +5 -0
- data/app/countries/atlas_engine/tt/address_importer/open_address/mapper.rb +38 -0
- data/app/countries/atlas_engine/tt/country_profile.yml +7 -0
- data/app/countries/atlas_engine/us/country_profile.yml +12 -0
- data/app/countries/atlas_engine/us/synonyms.yml +350 -0
- data/app/graphql/atlas_engine/errors/locale_unsupported_error.rb +17 -0
- data/app/graphql/atlas_engine/schema.graphql +1293 -0
- data/app/graphql/atlas_engine/schema.rb +23 -0
- data/app/graphql/atlas_engine/types/address_validation/address_input.rb +51 -0
- data/app/graphql/atlas_engine/types/address_validation/concern_type.rb +20 -0
- data/app/graphql/atlas_engine/types/address_validation/enums/concern_enum.rb +15 -0
- data/app/graphql/atlas_engine/types/address_validation/field_type.rb +15 -0
- data/app/graphql/atlas_engine/types/address_validation/suggestion_type.rb +21 -0
- data/app/graphql/atlas_engine/types/base_argument.rb +9 -0
- data/app/graphql/atlas_engine/types/base_enum.rb +9 -0
- data/app/graphql/atlas_engine/types/base_field.rb +10 -0
- data/app/graphql/atlas_engine/types/base_input_object.rb +9 -0
- data/app/graphql/atlas_engine/types/base_interface.rb +10 -0
- data/app/graphql/atlas_engine/types/base_object.rb +9 -0
- data/app/graphql/atlas_engine/types/base_scalar.rb +9 -0
- data/app/graphql/atlas_engine/types/base_union.rb +9 -0
- data/app/graphql/atlas_engine/types/matching_strategy_type.rb +12 -0
- data/app/graphql/atlas_engine/types/mutation_type.rb +9 -0
- data/app/graphql/atlas_engine/types/query_type.rb +61 -0
- data/app/graphql/atlas_engine/types/validation_supported_country.rb +12 -0
- data/app/graphql/atlas_engine/types/validation_type.rb +22 -0
- data/app/helpers/atlas_engine/address_importer/import_log_helper.rb +66 -0
- data/app/helpers/atlas_engine/application_helper.rb +7 -0
- data/app/helpers/atlas_engine/locale_format_helper.rb +40 -0
- data/app/helpers/atlas_engine/log_base.rb +32 -0
- data/app/helpers/atlas_engine/log_helper.rb +24 -0
- data/app/helpers/atlas_engine/metrics_helper.rb +25 -0
- data/app/jobs/atlas_engine/address_importer/clear_records_job.rb +39 -0
- data/app/jobs/atlas_engine/address_importer/open_address/geo_json_import_job.rb +212 -0
- data/app/jobs/atlas_engine/address_importer/open_address/geo_json_import_launcher_job.rb +67 -0
- data/app/jobs/atlas_engine/address_importer/open_address/prepares_geo_json_file.rb +41 -0
- data/app/jobs/atlas_engine/address_importer/resumable_import_job.rb +49 -0
- data/app/jobs/atlas_engine/address_importer/street_backfill_job.rb +63 -0
- data/app/jobs/atlas_engine/application_job.rb +10 -0
- data/app/jobs/atlas_engine/concerns/address_importer/handles_errors.rb +43 -0
- data/app/lib/atlas_engine/concern_formatter.rb +40 -0
- data/app/lib/atlas_engine/restrictions/base.rb +20 -0
- data/app/lib/atlas_engine/restrictions/unsupported_script.rb +31 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parser_base.rb +201 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parser_factory.rb +27 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parser_north_america.rb +39 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parser_oceanic.rb +17 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parser_preprocessor.rb +132 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parsing_helper.rb +38 -0
- data/app/lib/atlas_engine/validation_transcriber/address_parsings.rb +54 -0
- data/app/lib/atlas_engine/validation_transcriber/constants.rb +50 -0
- data/app/lib/atlas_engine/validation_transcriber/english_street_parser.rb +59 -0
- data/app/lib/atlas_engine/validation_transcriber/formatter.rb +46 -0
- data/app/lib/atlas_engine/validation_transcriber/french_street_parser.rb +50 -0
- data/app/lib/atlas_engine/validation_transcriber/province_code_normalizer.rb +45 -0
- data/app/lib/atlas_engine/validation_transcriber/street_parser.rb +18 -0
- data/app/lib/atlas_engine/validation_transcriber/zip_normalizer.rb +23 -0
- data/app/mailers/atlas_engine/application_mailer.rb +9 -0
- data/app/models/atlas_engine/address_importer/corrections/corrector.rb +33 -0
- data/app/models/atlas_engine/address_importer/import_events_notifier/base.rb +35 -0
- data/app/models/atlas_engine/address_importer/import_events_notifier/notifier.rb +26 -0
- data/app/models/atlas_engine/address_importer/open_address/default_mapper.rb +46 -0
- data/app/models/atlas_engine/address_importer/open_address/feature_helper.rb +110 -0
- data/app/models/atlas_engine/address_importer/open_address/filter.rb +17 -0
- data/app/models/atlas_engine/address_importer/open_address/loader.rb +27 -0
- data/app/models/atlas_engine/address_importer/open_address/transformer.rb +39 -0
- data/app/models/atlas_engine/address_importer/open_address.rb +10 -0
- data/app/models/atlas_engine/address_importer/validation/base_validator.rb +86 -0
- data/app/models/atlas_engine/address_importer/validation/default_validator.rb +27 -0
- data/app/models/atlas_engine/address_importer/validation/field_validations/city.rb +47 -0
- data/app/models/atlas_engine/address_importer/validation/field_validations/interface.rb +29 -0
- data/app/models/atlas_engine/address_importer/validation/field_validations/province.rb +73 -0
- data/app/models/atlas_engine/address_importer/validation/field_validations/zip.rb +84 -0
- data/app/models/atlas_engine/address_importer/validation/validator.rb +17 -0
- data/app/models/atlas_engine/address_importer/validation/wrapper.rb +70 -0
- data/app/models/atlas_engine/address_number.rb +36 -0
- data/app/models/atlas_engine/address_number_range.rb +200 -0
- data/app/models/atlas_engine/address_validation/abstract_address.rb +49 -0
- data/app/models/atlas_engine/address_validation/address.rb +47 -0
- data/app/models/atlas_engine/address_validation/candidate.rb +109 -0
- data/app/models/atlas_engine/address_validation/candidate_tuple.rb +15 -0
- data/app/models/atlas_engine/address_validation/concern.rb +74 -0
- data/app/models/atlas_engine/address_validation/concern_producer.rb +19 -0
- data/app/models/atlas_engine/address_validation/concern_queue.rb +20 -0
- data/app/models/atlas_engine/address_validation/concern_record.rb +122 -0
- data/app/models/atlas_engine/address_validation/datastore_base.rb +27 -0
- data/app/models/atlas_engine/address_validation/errors.rb +13 -0
- data/app/models/atlas_engine/address_validation/es/candidate_selector.rb +70 -0
- data/app/models/atlas_engine/address_validation/es/data_mappers/decompounding_data_mapper.rb +39 -0
- data/app/models/atlas_engine/address_validation/es/data_mappers/default_data_mapper.rb +110 -0
- data/app/models/atlas_engine/address_validation/es/datastore.rb +229 -0
- data/app/models/atlas_engine/address_validation/es/default_query_builder.rb +30 -0
- data/app/models/atlas_engine/address_validation/es/query_builder.rb +160 -0
- data/app/models/atlas_engine/address_validation/es/term_vectors.rb +78 -0
- data/app/models/atlas_engine/address_validation/es/validators/full_address.rb +123 -0
- data/app/models/atlas_engine/address_validation/es/validators/full_address_street.rb +18 -0
- data/app/models/atlas_engine/address_validation/es/validators/restriction_evaluator.rb +37 -0
- data/app/models/atlas_engine/address_validation/field.rb +30 -0
- data/app/models/atlas_engine/address_validation/full_address_validator_base.rb +27 -0
- data/app/models/atlas_engine/address_validation/log_emitter.rb +66 -0
- data/app/models/atlas_engine/address_validation/matching_strategies.rb +16 -0
- data/app/models/atlas_engine/address_validation/normalizer.rb +38 -0
- data/app/models/atlas_engine/address_validation/predicate_pipeline.rb +80 -0
- data/app/models/atlas_engine/address_validation/request.rb +12 -0
- data/app/models/atlas_engine/address_validation/result.rb +154 -0
- data/app/models/atlas_engine/address_validation/runs_validation.rb +16 -0
- data/app/models/atlas_engine/address_validation/session.rb +47 -0
- data/app/models/atlas_engine/address_validation/statsd_emitter.rb +72 -0
- data/app/models/atlas_engine/address_validation/strategies.rb +10 -0
- data/app/models/atlas_engine/address_validation/suggestion.rb +97 -0
- data/app/models/atlas_engine/address_validation/token/comparator.rb +44 -0
- data/app/models/atlas_engine/address_validation/token/comparison.rb +76 -0
- data/app/models/atlas_engine/address_validation/token/sequence/comparator.rb +158 -0
- data/app/models/atlas_engine/address_validation/token/sequence/comparison.rb +166 -0
- data/app/models/atlas_engine/address_validation/token/sequence.rb +147 -0
- data/app/models/atlas_engine/address_validation/token/synonyms.rb +77 -0
- data/app/models/atlas_engine/address_validation/token.rb +113 -0
- data/app/models/atlas_engine/address_validation/validator.rb +147 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/address_comparison.rb +97 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/candidate_result.rb +164 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/candidate_result_base.rb +46 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/comparison_helper.rb +135 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/components_to_validate.rb +88 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/concern_builder.rb +127 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/exclusions/exclusion_base.rb +23 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/invalid_zip_concern_builder.rb +42 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/invalid_zip_for_country_concern.rb +37 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/invalid_zip_for_province_concern.rb +37 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/no_candidate_result.rb +26 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/number_comparison.rb +31 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/postal_code_matcher.rb +60 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/result_updater.rb +42 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/suggestion_builder.rb +140 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/unknown_address_concern.rb +30 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/unknown_province_concern.rb +38 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/unknown_zip_for_address_concern.rb +32 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/unmatched_field_concern.rb +84 -0
- data/app/models/atlas_engine/address_validation/validators/full_address/unsupported_script_result.rb +22 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/cache.rb +38 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/city/present.rb +36 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/country/exists.rb +34 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/country/valid_for_zip.rb +60 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/no_emojis.rb +38 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/no_html_tags.rb +39 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/no_url.rb +38 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/not_exceed_max_length.rb +34 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/not_exceed_max_token_count.rb +63 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/phone/valid.rb +41 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/predicate.rb +37 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/province/exists.rb +43 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/province/valid_for_country.rb +48 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/street/building_number_in_address1.rb +45 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/street/building_number_in_address1_or_address2.rb +43 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/street/present.rb +35 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/zip/present.rb +58 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/zip/valid_for_country.rb +45 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/zip/valid_for_province.rb +55 -0
- data/app/models/atlas_engine/address_validation/validators/predicates/zip/zip_base.rb +25 -0
- data/app/models/atlas_engine/address_validation/zip_truncator.rb +32 -0
- data/app/models/atlas_engine/application_record.rb +8 -0
- data/app/models/atlas_engine/coded_error.rb +18 -0
- data/app/models/atlas_engine/coded_errors.rb +17 -0
- data/app/models/atlas_engine/country_import.rb +44 -0
- data/app/models/atlas_engine/country_profile.rb +270 -0
- data/app/models/atlas_engine/country_profile_ingestion_subset.rb +42 -0
- data/app/models/atlas_engine/country_profile_subset_base.rb +22 -0
- data/app/models/atlas_engine/country_profile_validation_subset.rb +48 -0
- data/app/models/atlas_engine/country_repository.rb +110 -0
- data/app/models/atlas_engine/elasticsearch/client.rb +116 -0
- data/app/models/atlas_engine/elasticsearch/client_interface.rb +89 -0
- data/app/models/atlas_engine/elasticsearch/repository.rb +246 -0
- data/app/models/atlas_engine/elasticsearch/repository_interface.rb +82 -0
- data/app/models/atlas_engine/elasticsearch/response.rb +20 -0
- data/app/models/atlas_engine/event.rb +12 -0
- data/app/models/atlas_engine/field_decompounder.rb +36 -0
- data/app/models/atlas_engine/index_configuration_factory.rb +188 -0
- data/app/models/atlas_engine/post_address.rb +114 -0
- data/app/models/atlas_engine/post_address_importer.rb +34 -0
- data/app/models/atlas_engine/services/service_helper.rb +21 -0
- data/app/models/atlas_engine/services/validation.rb +65 -0
- data/app/models/atlas_engine/services/validation_eligibility.rb +18 -0
- data/app/models/atlas_engine/street.rb +34 -0
- data/app/tasks/maintenance/atlas_engine/elasticsearch_index_create_task.rb +106 -0
- data/app/tasks/maintenance/atlas_engine/geo_json_import_task.rb +29 -0
- data/app/views/atlas_engine/connectivity/index.html.erb +50 -0
- data/app/views/atlas_engine/country_imports/index.html.erb +49 -0
- data/app/views/atlas_engine/country_imports/show.html.erb +73 -0
- data/app/views/layouts/atlas_engine/application.html.erb +15 -0
- data/config/initializers/1.ruby_patches.rb +18 -0
- data/config/initializers/sorbet.rb +5 -0
- data/config/initializers/worldwide.rb +5 -0
- data/config/locales/internal/en.yml +14 -0
- data/config/routes.rb +17 -0
- data/db/data/address_synonyms/index_configurations/default.yml +141 -0
- data/db/data/country_profiles/default.yml +23 -0
- data/db/data/transcriber.yml +760 -0
- data/db/data/validation_pipelines/es.yml +58 -0
- data/db/data/validation_pipelines/es_street.yml +58 -0
- data/db/data/validation_pipelines/local.yml +60 -0
- data/db/migrate/20230919173037_create_atlas_engine_post_addresses.rb +25 -0
- data/db/migrate/20231117142735_add_building_and_unit_ranges_column.rb +7 -0
- data/db/migrate/20231117143536_create_atlas_engine_country_imports.rb +11 -0
- data/db/migrate/20231117145844_create_atlas_engine_events_table.rb +13 -0
- data/db/migrate/20231123153554_add_unique_index_to_atlas_engine_post_addresses.rb +14 -0
- data/db/migrate/20231123154658_add_index_to_post_addresses_on_source_id_locale_country_code.rb +12 -0
- data/lib/atlas_engine/engine.rb +10 -0
- data/lib/atlas_engine/version.rb +6 -0
- data/lib/atlas_engine.rb +66 -0
- data/lib/tasks/atlas_engine/address_importer.rake +20 -0
- metadata +553 -0
@@ -0,0 +1,44 @@
|
|
1
|
+
# typed: true
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "rubygems/text"
|
5
|
+
|
6
|
+
module AtlasEngine
  module AddressValidation
    class Token
      # Compares the values of two tokens and classifies how they relate:
      # identical, one a prefix of the other, one a suffix of the other, or
      # neither. The Levenshtein edit distance between the two values is
      # reported alongside the classification.
      class Comparator
        extend T::Sig
        include Gem::Text # provides levenshtein_distance

        sig { returns(Token) }
        attr_reader :left, :right

        sig { params(left_token: Token, right_token: Token).void }
        def initialize(left_token, right_token)
          @left = T.let(left_token, Token)
          @right = T.let(right_token, Token)
        end

        # Builds the Comparison describing how left.value relates to right.value.
        #
        # Identical values yield qualifier :equal with an edit distance of 0.
        # Otherwise the qualifier is :prefix when either value starts with the
        # other, :suffix when either value ends with the other, and :comp in
        # every remaining case. The prefix check takes priority over the
        # suffix check.
        sig { returns(Comparison) }
        def compare
          l_val = left.value
          r_val = right.value

          return Comparison.new(left: left, right: right, qualifier: :equal, edit_distance: 0) if l_val == r_val

          distance = levenshtein_distance(l_val, r_val)

          relation =
            if r_val.start_with?(l_val) || l_val.start_with?(r_val)
              :prefix
            elsif r_val.end_with?(l_val) || l_val.end_with?(r_val)
              :suffix
            else
              :comp
            end

          Comparison.new(left: left, right: right, qualifier: relation, edit_distance: distance)
        end
      end
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# typed: true
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module AtlasEngine
  module AddressValidation
    class Token
      # Result of comparing two tokens: the tokens themselves, a qualifier
      # describing how their values relate, and the edit distance between them.
      #
      # Instances are Comparable. A smaller edit distance sorts first; on equal
      # edit distance the qualifier's position in QUALIFIERS breaks the tie.
      class Comparison
        extend T::Sig
        include Comparable

        # Accepted qualifiers. The order matters: the index of a qualifier in
        # this list is used as its rank by #<=>.
        QUALIFIERS = T.let(
          %i[equal prefix suffix comp].freeze,
          T::Array[Symbol],
        )

        sig { returns(Token) }
        attr_reader :left, :right

        sig { returns(Symbol) }
        attr_reader :qualifier

        sig { returns(Integer) }
        attr_reader :edit_distance

        # Raises a RuntimeError ("Unknown qualifier") when +qualifier+ is not
        # one of QUALIFIERS.
        sig do
          params(
            left: Token,
            right: Token,
            qualifier: Symbol,
            edit_distance: Integer,
          ).void
        end
        def initialize(left:, right:, qualifier:, edit_distance:)
          raise "Unknown qualifier" unless QUALIFIERS.include?(qualifier)

          @left = left
          @right = right
          @qualifier = qualifier
          @edit_distance = edit_distance
        end

        # Orders by edit distance first, then by qualifier rank.
        sig { params(other: Comparison).returns(Integer) }
        def <=>(other)
          by_distance = edit_distance <=> other.edit_distance
          return by_distance unless by_distance.zero?

          qualifier_rank <=> other.qualifier_rank
        end

        # True when the qualifier is :equal.
        # Note: this takes no argument, unlike Object#equal?(other), which it
        # replaces on instances of this class.
        sig { returns(T::Boolean) }
        def equal?
          :equal == qualifier
        end

        # True when both the left and the right token of this comparison
        # precede the corresponding token of +other+ (as decided by
        # Token#preceeds?).
        sig { params(other: Comparison).returns(T::Boolean) }
        def preceeds?(other)
          return false unless left.preceeds?(other.left)

          right.preceeds?(other.right)
        end

        sig { returns(String) }
        def inspect
          label = qualifier.to_s.upcase
          "<comp left:#{left.inspect} #{label} right:#{right.inspect} edit:#{edit_distance}/>"
        end

        protected

        # Index of the qualifier within QUALIFIERS.
        sig { returns(Integer) }
        def qualifier_rank
          # The constructor guarantees the qualifier is listed, so the lookup
          # cannot come back nil.
          T.must(QUALIFIERS.index(qualifier))
        end
      end
    end
  end
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
# typed: true
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module AtlasEngine
  module AddressValidation
    class Token
      class Sequence
        # Compares two token sequences by pairing up their tokens.
        #
        # Every permutation of the left sequence is compared against every
        # permutation of the right sequence; the smallest resulting
        # Sequence::Comparison (per its <=>) is the answer.
        class Comparator
          extend T::Sig

          sig { returns(Sequence) }
          attr_reader :left, :right

          # Hash keyed by [left_token, right_token] whose default block computes
          # and stores the Token::Comparison for that pair on first access.
          attr_reader :comparison_cache

          # Largest accepted value of edit_distance / length-of-longer-token for
          # a pair to count as a match. Despite the name this is a ratio
          # (0.5 means half the length), not a percentage.
          MAX_ALLOWED_EDIT_DISTANCE_PERCENT = 0.5

          sig { params(left_sequence: Sequence, right_sequence: Sequence).void }
          def initialize(left_sequence:, right_sequence:)
            @left = left_sequence
            @right = right_sequence
            @comparison_cache = Hash.new do |cache, (l_tok, r_tok)|
              cache[[l_tok, r_tok]] = AddressValidation::Token::Comparator.new(l_tok, r_tok).compare
            end
          end

          # Returns the minimum comparison over all left/right permutation pairs.
          sig { returns(Comparison) }
          def compare
            candidates = left.permutations.product(right.permutations).map do |left_permutation, right_permutation|
              flattened_sequence_compare(left_permutation, right_permutation)
            end

            T.must(candidates.min)
          end

          private

          # Token-level comparison for every (left, right) token pair, served
          # from comparison_cache.
          sig do
            params(
              left_permutations: T::Array[Token],
              right_permutations: T::Array[Token],
            ).returns(T::Array[Token::Comparison])
          end
          def token_comparisons(left_permutations, right_permutations)
            left_permutations.product(right_permutations).map do |l_tok, r_tok|
              comparison_cache[[l_tok, r_tok]]
            end
          end

          # Sorts closest comparisons first (Token::Comparison#<=>); ties are
          # broken by the smaller sum of the two tokens' positions.
          sig do
            params(
              token_comparisons: T::Array[Token::Comparison],
            ).returns(T::Array[Token::Comparison])
          end
          def sort_token_comparisons(token_comparisons)
            token_comparisons.sort do |a, b|
              (a <=> b).nonzero? ||
                ((a.left.position + a.right.position) <=> (b.left.position + b.right.position))
            end
          end

          # Greedily pairs tokens of one left permutation with tokens of one
          # right permutation and wraps the outcome in a Sequence::Comparison.
          sig do
            params(
              left_permutation: T::Array[Token],
              right_permutation: T::Array[Token],
            ).returns(Sequence::Comparison)
          end
          def flattened_sequence_compare(left_permutation, right_permutation)
            candidates = sort_token_comparisons(token_comparisons(left_permutation, right_permutation))
            accepted = []

            until candidates.empty?
              closest_match = T.must(candidates.shift)

              accepted << closest_match if tokens_match_by_edit_distance?(comparison: closest_match)

              # Whether or not the closest pair was accepted, both of its tokens
              # are now spent: drop every other candidate that reuses either one.
              candidates.delete_if do |comparison|
                same_token_or_position?(comparison.left, closest_match.left) ||
                  same_token_or_position?(comparison.right, closest_match.right)
              end
            end

            matched = accepted.sort { |a, b| a.left.position <=> b.left.position }

            Comparison.new(
              unmatched_tokens: unmatched_tokens(left_permutation, right_permutation, matched),
              token_comparisons: matched,
              left_sequence: left,
              right_sequence: right,
            )
          end

          # A pair matches when it is a :prefix comparison, or when its edit
          # distance is at most MAX_ALLOWED_EDIT_DISTANCE_PERCENT of the longer
          # value's length.
          sig { params(comparison: AddressValidation::Token::Comparison).returns(T::Boolean) }
          def tokens_match_by_edit_distance?(comparison:)
            longest_length = [comparison.left.value.length, comparison.right.value.length].max
            edit_distance_ratio = comparison.edit_distance.to_f / longest_length

            # 0.0 / 0 is NaN, and NaN <= x is always false, which would reject
            # two identical empty values. Zero edits over zero length is a
            # perfect match, so treat it as ratio 0.
            edit_distance_ratio = 0.0 if edit_distance_ratio.nan?

            comparison.qualifier == :prefix || edit_distance_ratio <= MAX_ALLOWED_EDIT_DISTANCE_PERCENT
          end

          # Tokens from either side that take part in none of +comparisons+,
          # left-side tokens first, then right-side tokens.
          sig do
            params(
              left_tokens: T::Array[Token],
              right_tokens: T::Array[Token],
              comparisons: T::Array[Token::Comparison],
            ).returns(T::Array[Token])
          end
          def unmatched_tokens(left_tokens, right_tokens, comparisons)
            unmatched_left = left_tokens.reject do |token|
              comparisons.any? { |comparison| same_token_or_position?(comparison.left, token) }
            end

            unmatched_right = right_tokens.reject do |token|
              comparisons.any? { |comparison| same_token_or_position?(comparison.right, token) }
            end

            remove_synonyms_at_same_position(unmatched_left) + remove_synonyms_at_same_position(unmatched_right)
          end

          # True when the tokens are == or share both offset_range and position.
          sig { params(token: Token, other_token: Token).returns(T::Boolean) }
          def same_token_or_position?(token, other_token)
            return true if token == other_token

            token.offset_range == other_token.offset_range && token.position == other_token.position
          end

          # Groups tokens by position and, in every group holding more than one
          # token, drops those whose type is "SYNONYM". Single-token groups are
          # kept as they are, even when that token is a synonym.
          sig { params(tokens: T::Array[Token]).returns(T::Array[Token]) }
          def remove_synonyms_at_same_position(tokens)
            tokens.group_by(&:position).values.flat_map do |group|
              group.size > 1 ? group.reject { |token| token.type == "SYNONYM" } : group
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
# typed: true
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module AtlasEngine
  module AddressValidation
    class Token
      class Sequence
        # Outcome of comparing two token sequences: the individual token comparisons that
        # were kept, the tokens reported as unmatched, and (optionally) the two sequences
        # themselves. Instances are ordered through Comparable; the comparison that sorts
        # lower is the better one (see #better_than?).
        class Comparison
          extend T::Sig
          include Comparable

          DEFAULT_PARTIAL_MATCH_THRESHOLD_PERCENT = 0.5

          attr_reader :unmatched_tokens, :left_sequence, :right_sequence, :token_comparisons

          sig do
            params(
              unmatched_tokens: T::Array[Token],
              token_comparisons: T::Array[Token::Comparison],
              left_sequence: T.nilable(Sequence),
              right_sequence: T.nilable(Sequence),
            ).void
          end
          def initialize(unmatched_tokens:, token_comparisons:, left_sequence:, right_sequence:)
            @unmatched_tokens = unmatched_tokens
            @token_comparisons = token_comparisons
            @left_sequence = left_sequence
            @right_sequence = right_sequence
          end

          # Orders comparisons so that the better one sorts first. The criteria are applied
          # in this order and the first one that differs decides:
          #   1. more :equal token comparisons
          #   2. fewer unmatched tokens
          #   3. longer longest subsequence (ties broken by how often that length occurs)
          #   4. smaller aggregate edit distance
          #   5. more :prefix token comparisons
          #   6. more :suffix token comparisons
          sig { params(other: Comparison).returns(Integer) }
          def <=>(other)
            matches = count_by_qualifier(:equal) <=> other.count_by_qualifier(:equal)
            return matches * -1 if matches.nonzero?

            unmatched = unmatched_tokens.size <=> other.unmatched_tokens.size
            return unmatched if unmatched.nonzero?

            longest_subsequence = longest_subsequence_comparison <=> other.longest_subsequence_comparison
            return -1 * longest_subsequence if longest_subsequence.nonzero?

            edit_distance = aggregate_edit_distance <=> other.aggregate_edit_distance
            return edit_distance if edit_distance.nonzero?

            prefixes = count_by_qualifier(:prefix) <=> other.count_by_qualifier(:prefix)
            return prefixes * -1 if prefixes.nonzero?

            (count_by_qualifier(:suffix) <=> other.count_by_qualifier(:suffix)) * -1
          end

          # Debug representation listing the unmatched tokens and one token comparison per line.
          sig { returns(String) }
          def inspect
            parts = ["["]
            token_comparisons.each do |comparison|
              parts << "\n#{comparison.inspect}"
            end
            parts << "\n" unless token_comparisons.empty?
            parts << "]"
            "<seqcomp unmatched:#{unmatched_tokens.inspect} comp:#{parts.join}/>"
          end

          # True when this comparison sorts before (ranks above) the other one.
          sig { params(other_comparison: Comparison).returns(T::Boolean) }
          def better_than?(other_comparison)
            self < other_comparison
          end

          # True when this comparison sorts after (ranks below) the other one.
          sig { params(other_comparison: Comparison).returns(T::Boolean) }
          def worse_than?(other_comparison)
            self > other_comparison
          end

          # True when none of the ordering criteria of #<=> tells the two comparisons apart.
          sig { params(other_comparison: Comparison).returns(T::Boolean) }
          def equivalent_to?(other_comparison)
            self == other_comparison
          end

          # Builds a new comparison holding the unmatched tokens of both operands and the
          # de-duplicated union of their token comparisons. A sequence is carried over only
          # when both operands reference the very same sequence object; otherwise it is nil.
          sig { params(other_comparison: Comparison).returns(Comparison) }
          def merge(other_comparison)
            AddressValidation::Token::Sequence::Comparison.new(
              unmatched_tokens: unmatched_tokens + other_comparison.unmatched_tokens,
              token_comparisons: (token_comparisons + other_comparison.token_comparisons).uniq,
              left_sequence: left_sequence.equal?(other_comparison.left_sequence) ? left_sequence : nil,
              right_sequence: right_sequence.equal?(other_comparison.right_sequence) ? right_sequence : nil,
            )
          end

          # A full match: no edit distance on any token comparison and nothing left unmatched.
          sig { returns(T::Boolean) }
          def match?
            aggregate_edit_distance == 0 && unmatched_tokens.empty?
          end

          # True when both the share of matched tokens and the share of matched characters
          # reach threshold_percent.
          sig { params(threshold_percent: Float).returns(T::Boolean) }
          def potential_match?(threshold_percent: DEFAULT_PARTIAL_MATCH_THRESHOLD_PERCENT)
            matched_tokens_percent >= threshold_percent && matched_length_percent >= threshold_percent
          end

          # Sum of the edit distances of all token comparisons.
          sig { returns(Integer) }
          def aggregate_edit_distance
            token_comparisons.sum(&:edit_distance)
          end

          # Number of token comparisons held by this comparison.
          sig { returns(Integer) }
          def token_match_count
            token_comparisons.size
          end

          protected

          # Number of token comparisons carrying the given qualifier (e.g. :equal, :prefix, :suffix).
          sig { params(qualifier: Symbol).returns(Integer) }
          def count_by_qualifier(qualifier)
            token_comparisons.count { |comparison| comparison.qualifier == qualifier }
          end

          sig { returns([Integer, Integer]) }
          def longest_subsequence_comparison
            max_subsequence_length = subsequence_lengths.max || 0
            # max length, number of times we saw a subsequence of max length (acts as a tiebreaker)
            [max_subsequence_length, subsequence_lengths.count(max_subsequence_length)]
          end

          private

          sig { returns(T::Array[Integer]) }
          def subsequence_lengths
            # Measure the length of runs of consecutive pairs of equal tokens. The position of
            # both compared tokens must increase by 1 relative to the preceding
            # AddressValidation::Token::Comparison's pair.
            # Memoized: token_comparisons is fixed at construction and this is read twice per
            # longest_subsequence_comparison call.
            @subsequence_lengths ||= equal_token_comparisons
              .chunk_while { |token_comp, next_token_comp| token_comp.preceeds?(next_token_comp) }
              .map(&:length)
              .select { |length| length > 1 } # trivial sequences of length 1 are ignored
          end

          # Token comparisons whose own zero-argument #equal? predicate is true.
          sig { returns(T::Array[Token::Comparison]) }
          def equal_token_comparisons
            token_comparisons.select(&:equal?)
          end

          # Share of tokens that took part in a comparison, rounded to 2 decimals. Every token
          # comparison accounts for two tokens (one from each side).
          # NOTE(review): with no comparisons and no unmatched tokens the division is 0.0 / 0
          # (NaN), which makes potential_match? false - confirm that is the intended result.
          sig { returns(Float) }
          def matched_tokens_percent
            matched_tokens_count = token_comparisons.size * 2
            unmatched_tokens_count = unmatched_tokens.size
            (matched_tokens_count.to_f / (matched_tokens_count + unmatched_tokens_count)).round(2)
          end

          # Share of characters that matched, rounded to 2 decimals. The matched length of a
          # pair is the combined length of both token values minus the pair's edit distance;
          # the denominator adds the unmatched token lengths and the aggregate edit distance.
          sig { returns(Float) }
          def matched_length_percent
            matched_length = token_comparisons.sum do |token_pair|
              token_pair.left.value.length + token_pair.right.value.length - token_pair.edit_distance
            end
            unmatched_length = unmatched_tokens.sum { |token| token.value.length }

            (matched_length.to_f / (matched_length + unmatched_length + aggregate_edit_distance)).round(2)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
# typed: true
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module AtlasEngine
  module AddressValidation
    class Token
      # An ordered list of tokens. Tokens that overlap the offset range of the token opening
      # their group are wrapped together in a Synonyms entry.
      class Sequence
        extend T::Sig

        class << self
          extend T::Sig
          include Normalizer
          ACCEPTABLE_CHARACTERS = /\p{Alnum}/

          # Builds a Sequence from a raw string: the string is split with
          # Annex29.segment_words, every segment is normalized, and segments without any
          # alphanumeric character are dropped. Offsets are computed from the raw segment
          # lengths, so they keep counting across dropped segments; positions only count
          # the tokens that are kept.
          sig { params(string: T.nilable(String)).returns(Sequence) }
          def from_string(string)
            end_offset = 0
            position = 0

            tokens = Annex29.segment_words(string).filter_map do |substring|
              start_offset = end_offset
              end_offset = start_offset + substring.length

              normalized_substring = normalize(substring)
              # annex 29 returns whitespace and punctuation as separate substrings
              next unless normalized_substring.match?(ACCEPTABLE_CHARACTERS)

              token = Token.new(
                value: normalized_substring,
                start_offset: start_offset,
                end_offset: end_offset,
                position: position,
                type: number?(substring) ? "<NUM>" : "<ALPHANUM>",
              )

              position += 1

              token
            end

            new(tokens: tokens, raw_value: string)
          end

          # True when Kernel#Float can parse the string. `exception: false` makes Float
          # return nil for unparsable input instead of raising, so no rescue is needed.
          def number?(string)
            !Float(string, exception: false).nil?
          end
        end

        TokenOrSynonyms = T.type_alias { T.any(Token, Synonyms) }

        sig { returns(T::Array[TokenOrSynonyms]) }
        attr_reader :tokens

        sig { returns(T.nilable(String)) }
        attr_reader :raw_value

        # Sorbet can't handle delegates https://github.com/sorbet/sorbet/issues/4794
        # rubocop:disable Rails/Delegate
        sig { returns(T::Boolean) }
        def empty? = tokens.empty?

        sig { returns(Integer) }
        def size = tokens.size

        sig { returns(Integer) }
        def length = tokens.length
        # rubocop:enable Rails/Delegate

        # Groups the given tokens by overlapping offsets; a group of exactly one token is
        # stored as that token, any other group becomes a Synonyms entry.
        sig { params(tokens: T::Array[Token], raw_value: T.nilable(String)).void }
        def initialize(tokens: [], raw_value: nil)
          @raw_value = raw_value
          @tokens = group_by_overlapping_offsets(tokens)
            .map { |tkns| tkns.one? ? T.must(tkns.first) : Synonyms.new(tokens: tkns) }
        end

        sig { returns(String) }
        def inspect
          "<seq #{tokens.inspect}/>"
        end

        # Every flat token list obtained by expanding the Synonyms entries of this sequence.
        sig { returns(T::Array[T::Array[Token]]) }
        def permutations = recursive_permutations(tokens)

        # Two sequences are equal when their token lists are equal; raw_value is not compared.
        def ==(other)
          return false unless other.is_a?(Sequence)

          tokens == other.tokens
        end

        private

        sig { params(token_array: T::Array[TokenOrSynonyms]).returns(T::Array[T::Array[Token]]) }
        def recursive_permutations(token_array)
          # we bottom out when token_array contains only simple tokens
          next_synonyms_index = token_array.find_index { |entry| entry.is_a?(Synonyms) }
          # There are no synonyms in that array, cast is safe
          return [T.cast(token_array, T::Array[Token])] unless next_synonyms_index

          new_tokens = token_array.dup
          synonyms = T.cast(new_tokens[next_synonyms_index], Synonyms)
          new_tokens.delete_at(next_synonyms_index)

          if synonyms.multi_token?
            # token_array (before synonyms object was deleted): [a, b, <syn [afb, [air, force, base]]/>, ...rest]
            # output: [[a, b, afb, ...rest], [a, b, air, force, base, ...rest]]
            synonyms.tokens.flat_map do |multi_token_entry|
              current_permutation = T.unsafe(new_tokens).dup.insert(next_synonyms_index, *Array(multi_token_entry))
              # ...rest will be handled recursively
              recursive_permutations(current_permutation)
            end
          else
            # token_array (before synonyms object was deleted): [a, b, <syn [st, street, saint]/>, ...rest]
            # output: [[a, b, st, street, saint, ...rest]]
            T.unsafe(new_tokens).insert(next_synonyms_index, *synonyms.tokens)
            # ...rest will be handled recursively
            recursive_permutations(new_tokens)
          end
        end

        # Splits the tokens, sorted by position, into consecutive groups. A group is opened
        # by a token and collects the following tokens for as long as the opening token's
        # offset range covers theirs; the first token that is not covered opens a new group.
        sig { params(tokens: T::Array[Token]).returns(T::Array[T::Array[Token]]) }
        def group_by_overlapping_offsets(tokens)
          return [] if tokens.empty?

          sorted_tokens = tokens.stable_sort_by(&:position)
          current_range = sorted_tokens.first&.offset_range

          groups = []
          current_group = []
          sorted_tokens.each do |token|
            if current_range.cover?(token.offset_range)
              current_group << token
            else
              groups << current_group
              current_group = [token]
              current_range = token.offset_range
            end
          end

          # the group still being filled when the loop ends has not been pushed yet
          groups << current_group
        end
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# typed: true
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module AtlasEngine
  module AddressValidation
    class Token
      # A set of alternatives occupying one slot of a sequence. It answers position,
      # offset_range, value and type the way a Token does.
      class Synonyms
        extend T::Sig

        TokenList = T.type_alias { T::Array[Token] }

        # Each entry is either a single Token or a TokenList, i.e. several tokens chained
        # one after the other by position.
        sig { returns(T::Array[T.any(Token, TokenList)]) }
        attr_reader :tokens

        # Sorbet can't handle delegates https://github.com/sorbet/sorbet/issues/4794
        # rubocop:disable Rails/Delegate
        sig { returns(Integer) }
        def position
          first_token.position
        end

        sig { returns(T::Range[Integer]) }
        def offset_range
          first_token.offset_range
        end
        # rubocop:enable Rails/Delegate

        # Arranges the given tokens into alternatives. Tokens are sorted and bucketed by
        # position; each pass starts at the first remaining position, takes one token from
        # it, then jumps ahead by that token's position_length and keeps taking tokens for
        # as long as a bucket exists at the position reached. A pass that collected a single
        # token stores the token itself, otherwise it stores the collected list.
        #
        # Raises ArgumentError when no tokens are given.
        sig { params(tokens: T::Array[Token]).void }
        def initialize(tokens: [])
          raise ArgumentError, "Synonyms cannot be empty" if tokens.empty?

          @tokens = []
          pending = tokens.stable_sort_by(&:position).group_by(&:position)

          while pending.values.any?(&:present?)
            chain = []
            cursor = pending.keys.first

            while pending.key?(cursor)
              picked = T.must(pending[cursor]).shift
              chain << picked
              cursor += T.must(picked).position_length
            end

            @tokens << (chain.one? ? chain.first : chain)
            # forget the positions whose tokens have all been consumed
            pending.compact_blank!
          end
        end

        sig { returns(String) }
        def inspect
          "<syn #{tokens.inspect}/>"
        end

        # A Synonyms entry carries no value of its own.
        sig { returns(NilClass) }
        def value = nil

        sig { returns(String) }
        def type = "<SYNONYMS>"

        # True when at least one alternative is a list of tokens rather than a single token.
        sig { returns(T::Boolean) }
        def multi_token?
          tokens.any? { |entry| entry.is_a?(Array) }
        end

        private

        # The first token of the first alternative, unwrapping it when that alternative is a list.
        sig { returns(Token) }
        def first_token
          head = T.must(tokens.first)
          return head unless head.is_a?(Array)

          T.must(head.first)
        end
      end
    end
  end
end
|