dsp-tools 9.1.0.post11__py3-none-any.whl → 18.3.0.post13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsp_tools/__init__.py +4 -0
- dsp_tools/cli/args.py +36 -0
- dsp_tools/cli/call_action.py +51 -231
- dsp_tools/cli/call_action_files_only.py +101 -0
- dsp_tools/cli/call_action_with_network.py +207 -0
- dsp_tools/cli/create_parsers.py +156 -58
- dsp_tools/cli/entry_point.py +56 -26
- dsp_tools/cli/utils.py +87 -0
- dsp_tools/clients/CLAUDE.md +420 -0
- dsp_tools/clients/authentication_client.py +14 -0
- dsp_tools/clients/authentication_client_live.py +66 -0
- dsp_tools/{utils → clients}/connection.py +2 -18
- dsp_tools/clients/connection_live.py +233 -0
- dsp_tools/clients/fuseki_metrics.py +60 -0
- dsp_tools/clients/group_user_clients.py +35 -0
- dsp_tools/clients/group_user_clients_live.py +181 -0
- dsp_tools/clients/legal_info_client.py +23 -0
- dsp_tools/clients/legal_info_client_live.py +132 -0
- dsp_tools/clients/list_client.py +49 -0
- dsp_tools/clients/list_client_live.py +166 -0
- dsp_tools/clients/metadata_client.py +24 -0
- dsp_tools/clients/metadata_client_live.py +47 -0
- dsp_tools/clients/ontology_clients.py +49 -0
- dsp_tools/clients/ontology_create_client_live.py +166 -0
- dsp_tools/clients/ontology_get_client_live.py +80 -0
- dsp_tools/clients/permissions_client.py +68 -0
- dsp_tools/clients/project_client.py +16 -0
- dsp_tools/clients/project_client_live.py +66 -0
- dsp_tools/commands/create/communicate_problems.py +24 -0
- dsp_tools/commands/create/create.py +134 -0
- dsp_tools/commands/create/create_on_server/cardinalities.py +111 -0
- dsp_tools/commands/create/create_on_server/classes.py +99 -0
- dsp_tools/commands/create/create_on_server/complete_ontologies.py +116 -0
- dsp_tools/commands/create/create_on_server/default_permissions.py +134 -0
- dsp_tools/commands/create/create_on_server/group_users.py +165 -0
- dsp_tools/commands/create/create_on_server/lists.py +163 -0
- dsp_tools/commands/create/create_on_server/mappers.py +12 -0
- dsp_tools/commands/create/create_on_server/onto_utils.py +74 -0
- dsp_tools/commands/create/create_on_server/ontology.py +52 -0
- dsp_tools/commands/create/create_on_server/project.py +68 -0
- dsp_tools/commands/create/create_on_server/properties.py +119 -0
- dsp_tools/commands/create/exceptions.py +29 -0
- dsp_tools/commands/create/lists_only.py +66 -0
- dsp_tools/commands/create/models/create_problems.py +87 -0
- dsp_tools/commands/create/models/parsed_ontology.py +88 -0
- dsp_tools/commands/create/models/parsed_project.py +81 -0
- dsp_tools/commands/create/models/rdf_ontology.py +12 -0
- dsp_tools/commands/create/models/server_project_info.py +100 -0
- dsp_tools/commands/create/parsing/parse_lists.py +45 -0
- dsp_tools/commands/create/parsing/parse_ontology.py +243 -0
- dsp_tools/commands/create/parsing/parse_project.py +149 -0
- dsp_tools/commands/create/parsing/parsing_utils.py +40 -0
- dsp_tools/commands/create/project_validate.py +595 -0
- dsp_tools/commands/create/serialisation/ontology.py +119 -0
- dsp_tools/commands/create/serialisation/project.py +44 -0
- dsp_tools/commands/excel2json/CLAUDE.md +101 -0
- dsp_tools/commands/excel2json/json_header.py +57 -23
- dsp_tools/commands/excel2json/{new_lists → lists}/compliance_checks.py +26 -26
- dsp_tools/commands/excel2json/{new_lists/make_new_lists.py → lists/make_lists.py} +19 -18
- dsp_tools/commands/excel2json/{new_lists → lists}/models/input_error.py +1 -12
- dsp_tools/commands/excel2json/{new_lists → lists}/models/serialise.py +9 -5
- dsp_tools/commands/excel2json/{new_lists → lists}/utils.py +4 -4
- dsp_tools/commands/excel2json/models/input_error.py +31 -11
- dsp_tools/commands/excel2json/models/json_header.py +53 -15
- dsp_tools/commands/excel2json/models/ontology.py +4 -3
- dsp_tools/commands/excel2json/{lists.py → old_lists.py} +26 -112
- dsp_tools/commands/excel2json/project.py +78 -34
- dsp_tools/commands/excel2json/properties.py +57 -36
- dsp_tools/commands/excel2json/resources.py +32 -12
- dsp_tools/commands/excel2json/utils.py +20 -1
- dsp_tools/commands/excel2xml/__init__.py +2 -2
- dsp_tools/commands/excel2xml/excel2xml_cli.py +7 -15
- dsp_tools/commands/excel2xml/excel2xml_lib.py +138 -493
- dsp_tools/commands/excel2xml/propertyelement.py +5 -5
- dsp_tools/commands/{project → get}/get.py +29 -13
- dsp_tools/commands/get/get_permissions.py +257 -0
- dsp_tools/commands/get/get_permissions_legacy.py +89 -0
- dsp_tools/commands/{project/models → get/legacy_models}/context.py +6 -6
- dsp_tools/commands/{project/models → get/legacy_models}/group.py +5 -10
- dsp_tools/commands/{project/models → get/legacy_models}/listnode.py +5 -35
- dsp_tools/commands/{project/models → get/legacy_models}/model.py +1 -1
- dsp_tools/commands/{project/models → get/legacy_models}/ontology.py +9 -14
- dsp_tools/commands/{project/models → get/legacy_models}/project.py +13 -6
- dsp_tools/commands/{project/models → get/legacy_models}/propertyclass.py +9 -16
- dsp_tools/commands/{project/models → get/legacy_models}/resourceclass.py +8 -46
- dsp_tools/commands/{project/models → get/legacy_models}/user.py +19 -60
- dsp_tools/commands/get/models/permissions_models.py +10 -0
- dsp_tools/commands/id2iri.py +20 -10
- dsp_tools/commands/ingest_xmlupload/bulk_ingest_client.py +81 -56
- dsp_tools/commands/ingest_xmlupload/create_resources/apply_ingest_id.py +4 -10
- dsp_tools/commands/ingest_xmlupload/create_resources/upload_xml.py +97 -37
- dsp_tools/commands/ingest_xmlupload/create_resources/user_information.py +2 -2
- dsp_tools/commands/ingest_xmlupload/ingest_files/ingest_files.py +9 -10
- dsp_tools/commands/ingest_xmlupload/upload_files/filechecker.py +3 -3
- dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +2 -10
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_failures.py +12 -2
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_files.py +8 -9
- dsp_tools/commands/resume_xmlupload/resume_xmlupload.py +18 -18
- dsp_tools/commands/start_stack.py +126 -77
- dsp_tools/commands/update_legal/CLAUDE.md +344 -0
- dsp_tools/commands/update_legal/__init__.py +0 -0
- dsp_tools/commands/update_legal/core.py +182 -0
- dsp_tools/commands/update_legal/csv_operations.py +135 -0
- dsp_tools/commands/update_legal/models.py +87 -0
- dsp_tools/commands/update_legal/xml_operations.py +247 -0
- dsp_tools/commands/validate_data/CLAUDE.md +159 -0
- dsp_tools/commands/validate_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/constants.py +59 -0
- dsp_tools/commands/validate_data/mappers.py +143 -0
- dsp_tools/commands/validate_data/models/__init__.py +0 -0
- dsp_tools/commands/validate_data/models/api_responses.py +45 -0
- dsp_tools/commands/validate_data/models/input_problems.py +119 -0
- dsp_tools/commands/validate_data/models/rdf_like_data.py +117 -0
- dsp_tools/commands/validate_data/models/validation.py +106 -0
- dsp_tools/commands/validate_data/prepare_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/prepare_data/get_rdf_like_data.py +296 -0
- dsp_tools/commands/validate_data/prepare_data/make_data_graph.py +91 -0
- dsp_tools/commands/validate_data/prepare_data/prepare_data.py +184 -0
- dsp_tools/commands/validate_data/process_validation_report/__init__.py +0 -0
- dsp_tools/commands/validate_data/process_validation_report/get_user_validation_message.py +358 -0
- dsp_tools/commands/validate_data/process_validation_report/query_validation_result.py +507 -0
- dsp_tools/commands/validate_data/process_validation_report/reformat_validation_results.py +150 -0
- dsp_tools/commands/validate_data/shacl_cli_validator.py +70 -0
- dsp_tools/commands/validate_data/sparql/__init__.py +0 -0
- dsp_tools/commands/{xml_validate/sparql/resource_shacl.py → validate_data/sparql/cardinality_shacl.py} +45 -47
- dsp_tools/commands/validate_data/sparql/construct_shacl.py +92 -0
- dsp_tools/commands/validate_data/sparql/legal_info_shacl.py +36 -0
- dsp_tools/commands/validate_data/sparql/value_shacl.py +357 -0
- dsp_tools/commands/validate_data/utils.py +59 -0
- dsp_tools/commands/validate_data/validate_data.py +283 -0
- dsp_tools/commands/validate_data/validation/__init__.py +0 -0
- dsp_tools/commands/validate_data/validation/check_duplicate_files.py +55 -0
- dsp_tools/commands/validate_data/validation/check_for_unknown_classes.py +67 -0
- dsp_tools/commands/validate_data/validation/get_validation_report.py +94 -0
- dsp_tools/commands/validate_data/validation/validate_ontology.py +107 -0
- dsp_tools/commands/xmlupload/CLAUDE.md +292 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/__init__.py +0 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/constants.py +63 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/jsonld_utils.py +44 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_file_value.py +77 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_resource_and_values.py +114 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_values.py +262 -0
- dsp_tools/commands/xmlupload/models/bitstream_info.py +18 -0
- dsp_tools/commands/xmlupload/models/formatted_text_value.py +0 -25
- dsp_tools/commands/xmlupload/models/ingest.py +56 -70
- dsp_tools/commands/xmlupload/models/input_problems.py +6 -14
- dsp_tools/commands/xmlupload/models/lookup_models.py +21 -0
- dsp_tools/commands/xmlupload/models/permission.py +0 -39
- dsp_tools/commands/xmlupload/models/{deserialise/xmlpermission.py → permissions_parsed.py} +2 -2
- dsp_tools/commands/xmlupload/models/processed/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/processed/file_values.py +29 -0
- dsp_tools/commands/xmlupload/models/processed/res.py +27 -0
- dsp_tools/commands/xmlupload/models/processed/values.py +101 -0
- dsp_tools/commands/xmlupload/models/rdf_models.py +26 -0
- dsp_tools/commands/xmlupload/models/upload_clients.py +3 -3
- dsp_tools/commands/xmlupload/models/upload_state.py +2 -4
- dsp_tools/commands/xmlupload/prepare_xml_input/__init__.py +0 -0
- dsp_tools/commands/xmlupload/{ark2iri.py → prepare_xml_input/ark2iri.py} +1 -1
- dsp_tools/commands/xmlupload/prepare_xml_input/get_processed_resources.py +252 -0
- dsp_tools/commands/xmlupload/{iiif_uri_validator.py → prepare_xml_input/iiif_uri_validator.py} +2 -14
- dsp_tools/commands/xmlupload/{list_client.py → prepare_xml_input/list_client.py} +15 -10
- dsp_tools/commands/xmlupload/prepare_xml_input/prepare_xml_input.py +67 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/read_validate_xml_file.py +58 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/transform_input_values.py +118 -0
- dsp_tools/commands/xmlupload/resource_create_client.py +7 -468
- dsp_tools/commands/xmlupload/richtext_id2iri.py +37 -0
- dsp_tools/commands/xmlupload/stash/{construct_and_analyze_graph.py → analyse_circular_reference_graph.py} +64 -157
- dsp_tools/commands/xmlupload/stash/create_info_for_graph.py +53 -0
- dsp_tools/commands/xmlupload/stash/graph_models.py +13 -8
- dsp_tools/commands/xmlupload/stash/stash_circular_references.py +48 -115
- dsp_tools/commands/xmlupload/stash/stash_models.py +4 -9
- dsp_tools/commands/xmlupload/stash/upload_stashed_resptr_props.py +34 -40
- dsp_tools/commands/xmlupload/stash/upload_stashed_xml_texts.py +98 -108
- dsp_tools/commands/xmlupload/upload_config.py +8 -0
- dsp_tools/commands/xmlupload/write_diagnostic_info.py +14 -9
- dsp_tools/commands/xmlupload/xmlupload.py +214 -192
- dsp_tools/config/__init__.py +0 -0
- dsp_tools/config/logger_config.py +69 -0
- dsp_tools/{utils → config}/warnings_config.py +4 -1
- dsp_tools/error/__init__.py +0 -0
- dsp_tools/error/custom_warnings.py +39 -0
- dsp_tools/error/exceptions.py +204 -0
- dsp_tools/error/problems.py +10 -0
- dsp_tools/error/xmllib_errors.py +20 -0
- dsp_tools/error/xmllib_warnings.py +54 -0
- dsp_tools/error/xmllib_warnings_util.py +159 -0
- dsp_tools/error/xsd_validation_error_msg.py +19 -0
- dsp_tools/legacy_models/__init__.py +0 -0
- dsp_tools/{models → legacy_models}/datetimestamp.py +7 -7
- dsp_tools/{models → legacy_models}/langstring.py +1 -1
- dsp_tools/{models → legacy_models}/projectContext.py +4 -4
- dsp_tools/resources/schema/data.xsd +108 -83
- dsp_tools/resources/schema/lists-only.json +4 -23
- dsp_tools/resources/schema/project.json +80 -35
- dsp_tools/resources/schema/properties-only.json +1 -4
- dsp_tools/resources/start-stack/docker-compose.override-host.j2 +11 -0
- dsp_tools/resources/start-stack/docker-compose.yml +34 -30
- dsp_tools/resources/start-stack/dsp-app-config.json +45 -0
- dsp_tools/resources/start-stack/dsp-app-config.override-host.j2 +26 -0
- dsp_tools/resources/validate_data/api-shapes-resource-cardinalities.ttl +191 -0
- dsp_tools/resources/validate_data/api-shapes.ttl +804 -0
- dsp_tools/resources/validate_data/shacl-cli-image.yml +4 -0
- dsp_tools/resources/validate_data/validate-ontology.ttl +99 -0
- dsp_tools/utils/ansi_colors.py +32 -0
- dsp_tools/utils/data_formats/__init__.py +0 -0
- dsp_tools/utils/{date_util.py → data_formats/date_util.py} +13 -1
- dsp_tools/utils/data_formats/iri_util.py +30 -0
- dsp_tools/utils/{shared.py → data_formats/shared.py} +1 -35
- dsp_tools/utils/{uri_util.py → data_formats/uri_util.py} +12 -2
- dsp_tools/utils/fuseki_bloating.py +63 -0
- dsp_tools/utils/json_parsing.py +22 -0
- dsp_tools/utils/rdf_constants.py +42 -0
- dsp_tools/utils/rdflib_utils.py +10 -0
- dsp_tools/utils/replace_id_with_iri.py +66 -0
- dsp_tools/utils/request_utils.py +238 -0
- dsp_tools/utils/xml_parsing/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/get_lookups.py +32 -0
- dsp_tools/utils/xml_parsing/get_parsed_resources.py +325 -0
- dsp_tools/utils/xml_parsing/models/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/models/parsed_resource.py +76 -0
- dsp_tools/utils/xml_parsing/parse_clean_validate_xml.py +137 -0
- dsp_tools/xmllib/CLAUDE.md +302 -0
- dsp_tools/xmllib/__init__.py +49 -0
- dsp_tools/xmllib/general_functions.py +877 -0
- dsp_tools/xmllib/internal/__init__.py +0 -0
- dsp_tools/xmllib/internal/checkers.py +162 -0
- dsp_tools/xmllib/internal/circumvent_circular_imports.py +36 -0
- dsp_tools/xmllib/internal/constants.py +46 -0
- dsp_tools/xmllib/internal/input_converters.py +155 -0
- dsp_tools/xmllib/internal/serialise_file_value.py +57 -0
- dsp_tools/xmllib/internal/serialise_resource.py +177 -0
- dsp_tools/xmllib/internal/serialise_values.py +152 -0
- dsp_tools/xmllib/internal/type_aliases.py +11 -0
- dsp_tools/xmllib/models/config_options.py +28 -0
- dsp_tools/xmllib/models/date_formats.py +48 -0
- dsp_tools/xmllib/models/dsp_base_resources.py +1380 -400
- dsp_tools/xmllib/models/internal/__init__.py +0 -0
- dsp_tools/xmllib/models/internal/file_values.py +172 -0
- dsp_tools/xmllib/models/internal/geometry.py +162 -0
- dsp_tools/xmllib/models/{migration_metadata.py → internal/migration_metadata.py} +14 -10
- dsp_tools/xmllib/models/internal/serialise_permissions.py +66 -0
- dsp_tools/xmllib/models/internal/values.py +342 -0
- dsp_tools/xmllib/models/licenses/__init__.py +0 -0
- dsp_tools/xmllib/models/licenses/other.py +59 -0
- dsp_tools/xmllib/models/licenses/recommended.py +107 -0
- dsp_tools/xmllib/models/permissions.py +41 -0
- dsp_tools/xmllib/models/res.py +1782 -0
- dsp_tools/xmllib/models/root.py +313 -26
- dsp_tools/xmllib/value_checkers.py +310 -47
- dsp_tools/xmllib/value_converters.py +765 -8
- dsp_tools-18.3.0.post13.dist-info/METADATA +90 -0
- dsp_tools-18.3.0.post13.dist-info/RECORD +286 -0
- dsp_tools-18.3.0.post13.dist-info/WHEEL +4 -0
- {dsp_tools-9.1.0.post11.dist-info → dsp_tools-18.3.0.post13.dist-info}/entry_points.txt +1 -0
- dsp_tools/commands/project/create/project_create.py +0 -1107
- dsp_tools/commands/project/create/project_create_lists.py +0 -204
- dsp_tools/commands/project/create/project_validate.py +0 -453
- dsp_tools/commands/project/models/project_definition.py +0 -12
- dsp_tools/commands/rosetta.py +0 -124
- dsp_tools/commands/template.py +0 -30
- dsp_tools/commands/xml_validate/api_connection.py +0 -122
- dsp_tools/commands/xml_validate/deserialise_input.py +0 -135
- dsp_tools/commands/xml_validate/make_data_rdf.py +0 -193
- dsp_tools/commands/xml_validate/models/data_deserialised.py +0 -108
- dsp_tools/commands/xml_validate/models/data_rdf.py +0 -214
- dsp_tools/commands/xml_validate/models/input_problems.py +0 -191
- dsp_tools/commands/xml_validate/models/validation.py +0 -29
- dsp_tools/commands/xml_validate/reformat_validaton_result.py +0 -89
- dsp_tools/commands/xml_validate/sparql/construct_shapes.py +0 -16
- dsp_tools/commands/xml_validate/xml_validate.py +0 -151
- dsp_tools/commands/xmlupload/check_consistency_with_ontology.py +0 -253
- dsp_tools/commands/xmlupload/models/deserialise/deserialise_value.py +0 -236
- dsp_tools/commands/xmlupload/models/deserialise/xmlresource.py +0 -171
- dsp_tools/commands/xmlupload/models/namespace_context.py +0 -39
- dsp_tools/commands/xmlupload/models/ontology_lookup_models.py +0 -161
- dsp_tools/commands/xmlupload/models/ontology_problem_models.py +0 -178
- dsp_tools/commands/xmlupload/models/serialise/jsonld_serialiser.py +0 -40
- dsp_tools/commands/xmlupload/models/serialise/serialise_value.py +0 -51
- dsp_tools/commands/xmlupload/ontology_client.py +0 -92
- dsp_tools/commands/xmlupload/project_client.py +0 -91
- dsp_tools/commands/xmlupload/read_validate_xml_file.py +0 -99
- dsp_tools/models/custom_warnings.py +0 -31
- dsp_tools/models/exceptions.py +0 -90
- dsp_tools/resources/0100-template-repo/template.json +0 -45
- dsp_tools/resources/0100-template-repo/template.xml +0 -27
- dsp_tools/resources/start-stack/docker-compose-validation.yml +0 -5
- dsp_tools/resources/start-stack/start-stack-config.yml +0 -4
- dsp_tools/resources/xml_validate/api-shapes.ttl +0 -411
- dsp_tools/resources/xml_validate/replace_namespace.xslt +0 -61
- dsp_tools/utils/connection_live.py +0 -383
- dsp_tools/utils/iri_util.py +0 -14
- dsp_tools/utils/logger_config.py +0 -41
- dsp_tools/utils/set_encoder.py +0 -20
- dsp_tools/utils/xml_utils.py +0 -145
- dsp_tools/utils/xml_validation.py +0 -197
- dsp_tools/utils/xml_validation_models.py +0 -68
- dsp_tools/xmllib/models/file_values.py +0 -78
- dsp_tools/xmllib/models/resource.py +0 -415
- dsp_tools/xmllib/models/values.py +0 -428
- dsp_tools-9.1.0.post11.dist-info/METADATA +0 -130
- dsp_tools-9.1.0.post11.dist-info/RECORD +0 -167
- dsp_tools-9.1.0.post11.dist-info/WHEEL +0 -4
- dsp_tools-9.1.0.post11.dist-info/licenses/LICENSE +0 -674
- /dsp_tools/{commands/excel2json/new_lists → clients}/__init__.py +0 -0
- /dsp_tools/commands/{excel2json/new_lists/models → create}/__init__.py +0 -0
- /dsp_tools/commands/{project → create/create_on_server}/__init__.py +0 -0
- /dsp_tools/commands/{project/create → create/models}/__init__.py +0 -0
- /dsp_tools/commands/{project/models → create/parsing}/__init__.py +0 -0
- /dsp_tools/commands/{xml_validate → create/serialisation}/__init__.py +0 -0
- /dsp_tools/commands/{xml_validate/models → excel2json/lists}/__init__.py +0 -0
- /dsp_tools/commands/{xml_validate/sparql → excel2json/lists/models}/__init__.py +0 -0
- /dsp_tools/commands/excel2json/{new_lists → lists}/models/deserialise.py +0 -0
- /dsp_tools/commands/{xmlupload/models/deserialise → get}/__init__.py +0 -0
- /dsp_tools/commands/{xmlupload/models/serialise → get/legacy_models}/__init__.py +0 -0
- /dsp_tools/commands/{project/models → get/legacy_models}/helpers.py +0 -0
- /dsp_tools/{models → commands/get/models}/__init__.py +0 -0
|
@@ -1,20 +1,777 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
1
4
|
from typing import Any
|
|
2
5
|
|
|
6
|
+
import regex
|
|
7
|
+
from regex import Match
|
|
8
|
+
|
|
9
|
+
from dsp_tools.error.xmllib_warnings import MessageInfo
|
|
10
|
+
from dsp_tools.error.xmllib_warnings_util import emit_xmllib_input_warning
|
|
11
|
+
from dsp_tools.error.xmllib_warnings_util import raise_xmllib_input_error
|
|
12
|
+
from dsp_tools.xmllib.internal.checkers import is_date_internal
|
|
13
|
+
from dsp_tools.xmllib.internal.checkers import is_nonempty_value_internal
|
|
14
|
+
from dsp_tools.xmllib.models.config_options import NewlineReplacement
|
|
15
|
+
from dsp_tools.xmllib.models.date_formats import Calendar
|
|
16
|
+
from dsp_tools.xmllib.models.date_formats import DateFormat
|
|
17
|
+
from dsp_tools.xmllib.models.date_formats import Era
|
|
18
|
+
|
|
3
19
|
|
|
4
|
-
def convert_to_bool_string(value: Any) -> bool:
    """
    Turns a value into a `bool`.
    Despite the function name, a Python boolean is returned, not a string.
    The comparison is case-insensitive and ignores leading and trailing whitespace,
    meaning that the words can also be capitalised.

    Accepted values:
         - `false`, `0`, `0.0`, `no`, `non`, `nein` -> `False`
         - `true`, `1`, `1.0`, `yes`, `oui`, `ja`, `sì` -> `True`

    Args:
        value: value to transform

    Returns:
        `True` or `False` if it is an accepted value.

    Raises:
        XmllibInputError: If the value is not convertible to a boolean

    Examples:
        ```python
        result = xmllib.convert_to_bool_string(1)
        # result == True
        ```

        ```python
        result = xmllib.convert_to_bool_string("nein")
        # result == False
        ```

        ```python
        result = xmllib.convert_to_bool_string(None)
        # raises XmllibInputError
        ```
    """
    # Function-scope import so that the module-level import block does not have to change.
    import unicodedata

    # NFC-normalise first: a decomposed "sì" ("i" followed by U+0300 COMBINING GRAVE ACCENT)
    # is the same word as the precomposed form and must be accepted as well.
    str_val = unicodedata.normalize("NFC", str(value)).lower().strip()
    if str_val in ("false", "0", "0.0", "no", "non", "nein"):
        return False
    # "s\u00ec" is the precomposed (NFC) spelling of "sì"; the escape keeps it independent of the file encoding.
    if str_val in ("true", "1", "1.0", "yes", "oui", "ja", "s\u00ec"):
        return True
    raise_xmllib_input_error(MessageInfo(f"The entered value '{value}' cannot be converted to a bool."))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def replace_newlines_with_tags(text: str, converter_option: NewlineReplacement) -> str:
|
|
63
|
+
"""
|
|
64
|
+
Converts the newlines in a string to XML tags.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
text: string to convert
|
|
68
|
+
converter_option: specifies what tag to use instead of the newline
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
String with replaced values
|
|
72
|
+
|
|
73
|
+
Raises:
|
|
74
|
+
XmllibInputError: If an invalid conversion option is given
|
|
75
|
+
|
|
76
|
+
Examples:
|
|
77
|
+
```python
|
|
78
|
+
result = xmllib.replace_newlines_with_tags(
|
|
79
|
+
"Start\\nEnd", xmllib.NewlineReplacement.NONE
|
|
80
|
+
)
|
|
81
|
+
# result == "Start\\nEnd"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
result = xmllib.replace_newlines_with_tags(
|
|
86
|
+
"Start\\nEnd", xmllib.NewlineReplacement.LINEBREAK
|
|
87
|
+
)
|
|
88
|
+
# result == "Start<br/>End"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
result = xmllib.replace_newlines_with_tags(
|
|
93
|
+
"Start\\n\\nEnd", xmllib.NewlineReplacement.PARAGRAPH
|
|
94
|
+
)
|
|
95
|
+
# result == "<p>Start</p><p>End</p>"
|
|
96
|
+
```
|
|
97
|
+
"""
|
|
98
|
+
match converter_option:
|
|
99
|
+
case NewlineReplacement.NONE:
|
|
100
|
+
return text
|
|
101
|
+
case NewlineReplacement.LINEBREAK:
|
|
102
|
+
return replace_newlines_with_br_tags(text)
|
|
103
|
+
case NewlineReplacement.PARAGRAPH:
|
|
104
|
+
return replace_newlines_with_paragraph_tags(text)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def replace_newlines_with_paragraph_tags(text: str) -> str:
    """
    Wraps every non-empty line of a string in paragraph tags,
    for example `Start\\nEnd` becomes `<p>Start</p><p>End</p>`.

    Args:
        text: string to be formatted

    Returns:
        Formatted string with paragraph tags

    Examples:
        ```python
        result = xmllib.replace_newlines_with_paragraph_tags("Start\\nEnd")
        # result == "<p>Start</p><p>End</p>"
        ```

        ```python
        # empty lines are dropped, so consecutive newlines behave like a single newline

        result = xmllib.replace_newlines_with_paragraph_tags("Start\\n\\nEnd")
        # result == "<p>Start</p><p>End</p>"
        ```
    """
    # Empty strings are falsy, so the filter drops exactly the empty lines produced by the split.
    paragraphs = (f"<p>{line}</p>" for line in text.split("\n") if line)
    return "".join(paragraphs)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def replace_newlines_with_br_tags(text: str) -> str:
    """
    Substitutes every `\\n` in a string with a `<br/>` tag.

    Args:
        text: string to be formatted

    Returns:
        Formatted string with break-line tags

    Examples:
        ```python
        result = xmllib.replace_newlines_with_br_tags("Start\\nEnd")
        # result == "Start<br/>End"
        ```

        ```python
        # each newline yields its own tag, consecutive newlines are not merged

        result = xmllib.replace_newlines_with_br_tags("Start\\n\\nEnd")
        # result == "Start<br/><br/>End"
        ```
    """
    lines = text.split("\n")
    return "<br/>".join(lines)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def reformat_date(
    date: str | int,
    date_precision_separator: str | None,
    date_range_separator: str | None,
    date_format: DateFormat,
    calendar: Calendar = Calendar.GREGORIAN,
    era: Era | None = Era.CE,
    resource_id: str | None = None,
) -> str:
    """
    Brings a date (or a date range) into the DSP date format.

    - An empty input results in a warning and an empty string.
    - An input that already contains a calendar name is treated as an already reformatted date:
      it is returned unchanged, with a warning if it is not a valid DSP date.
    - If the reformatted result is not a valid DSP date,
      a warning is emitted and the (stripped) original input is returned.

    Args:
        date: date string to be reformatted
        date_precision_separator: the separation between the day, month and year
        date_range_separator: the separation between two dates
        date_format: the format of the date, see [`DateFormat` for options](https://docs.dasch.swiss/latest/DSP-TOOLS/xmllib-docs/date_formats/#xmllib.models.date_formats.DateFormat)
        calendar: the calendar of the date, see [`Calendar` for options](https://docs.dasch.swiss/latest/DSP-TOOLS/xmllib-docs/date_formats/#xmllib.models.date_formats.Calendar)
        era: the era of the date, see [`Era` for options](https://docs.dasch.swiss/latest/DSP-TOOLS/xmllib-docs/date_formats/#xmllib.models.date_formats.Era)
        resource_id: the ID of the associated resource, this is to improve the error message

    Returns:
        A reformatted date or the original input if the reformatted result is not a valid DSP date

    Raises:
        XmllibInputError: If the precision separator and the range separator are identical,
            or if the date format is invalid

    Examples:
        ```python
        # default configuration, starting with the day
        result = xmllib.reformat_date(
            date="1.11.2000",
            date_precision_separator=".",
            date_range_separator=None,
            date_format=xmllib.DateFormat.DD_MM_YYYY
        )
        # result == "GREGORIAN:CE:2000-11-1:CE:2000-11-1"
        ```

        ```python
        # default configuration, but starting with the year
        result = xmllib.reformat_date(
            date="2000.11.1",
            date_precision_separator=".",
            date_range_separator=None,
            date_format=xmllib.DateFormat.YYYY_MM_DD,
        )
        # result == "GREGORIAN:CE:2000-11-1:CE:2000-11-1"
        ```

        ```python
        # with a date range
        result = xmllib.reformat_date(
            date="1.11.2000-2001",
            date_precision_separator=".",
            date_range_separator="-",
            date_format=xmllib.DateFormat.DD_MM_YYYY,
        )
        # result == "GREGORIAN:CE:2000-11-1:CE:2001"
        ```

        ```python
        # islamic calendar, where eras are not allowed
        result = xmllib.reformat_date(
            date="1.11.2000",
            date_precision_separator=".",
            date_range_separator=None,
            date_format=xmllib.DateFormat.DD_MM_YYYY,
            calendar=xmllib.Calendar.ISLAMIC,
            era=None
        )
        # result == "ISLAMIC:2000-11-1:2000-11-1"
        ```

        ```python
        # with a different era
        result = xmllib.reformat_date(
            date="1.11.2000",
            date_precision_separator=".",
            date_range_separator="-",
            date_format=xmllib.DateFormat.DD_MM_YYYY,
            era=xmllib.Era.AD
        )
        # result == "GREGORIAN:AD:2000-11-1:AD:2000-11-1"
        ```

        ```python
        # reformatted date, no precision in the date string is required
        result = xmllib.reformat_date(
            date="2000",
            date_precision_separator=".",
            date_range_separator="-",
            date_format=xmllib.DateFormat.DD_MM_YYYY,
        )
        # result == "GREGORIAN:CE:2000:CE:2000"
        ```

        ```python
        # already correctly formatted date
        result = xmllib.reformat_date(
            date="GREGORIAN:CE:2000:CE:2000",
            date_precision_separator=".",
            date_range_separator="-",
            date_format=xmllib.DateFormat.DD_MM_YYYY,
        )
        # result == "GREGORIAN:CE:2000:CE:2000"
        ```

        ```python
        # invalid input: a warning is emitted and the original input is returned
        result = xmllib.reformat_date(
            date="not-a-date",
            date_precision_separator=".",
            date_range_separator="-",
            date_format=xmllib.DateFormat.DD_MM_YYYY,
        )
        # WARNING is emitted
        # result == "not-a-date"
        ```
    """
    if not is_nonempty_value_internal(date):
        emit_xmllib_input_warning(
            MessageInfo("The date to be reformatted is empty. An empty string is returned.", resource_id=resource_id)
        )
        return ""

    date_str = str(date).strip()
    not_conform_info = MessageInfo(
        f"The provided date '{date_str}' does not conform to the expected format, the original value is returned.",
        resource_id=resource_id,
    )

    # A calendar name signals that the input claims to be a reformatted date already.
    # The validity check alone is not enough to detect that, because it does not require a calendar,
    # so unformatted input such as '2000' may pass it too.
    if regex.search(r"(GREGORIAN|JULIAN|ISLAMIC)", date_str):
        if not is_date_internal(date_str):
            emit_xmllib_input_warning(not_conform_info)
        return date_str

    # Identical separators would make it impossible to tell date parts and range parts apart.
    identical_separators = (
        date_precision_separator and date_range_separator and date_precision_separator == date_range_separator
    )
    if identical_separators:
        raise_xmllib_input_error(
            MessageInfo(
                f"The precision separator and range separator provided are identical: '{date_precision_separator}'. "
                f"This is not allowed.",
                resource_id=resource_id,
            )
        )

    if date_range_separator is None:
        range_parts = [date_str]
    else:
        range_parts = [stripped for part in date_str.split(date_range_separator) if (stripped := part.strip())]

    dsp_dates = [
        _reformat_single_date(part, date_precision_separator, date_format, resource_id) for part in range_parts
    ]
    if era:
        dsp_dates = [f"{era.value}:{single}" for single in dsp_dates]
    if len(dsp_dates) == 1:
        # A single date is written as a range that starts and ends with the same date.
        dsp_dates = dsp_dates * 2

    joined = ":".join(dsp_dates)
    candidate = f"{calendar.value}:{joined}" if calendar else joined
    if not is_date_internal(candidate):
        emit_xmllib_input_warning(not_conform_info)
        return date_str
    return candidate
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _reformat_single_date(  # noqa: PLR0911 Too many return statements
    single_date: str, date_precision_separator: str | None, date_format: DateFormat, resource_id: str | None
) -> str:
    """
    Bring the components of one date into year-month-day order, joined by hyphens.

    Args:
        single_date: the date as found in the input
        date_precision_separator: separator between the components of the date;
            if None, the date is returned untouched
        date_format: order in which the components appear in the input
        resource_id: ID of the resource, passed on to the warning / error message

    Returns:
        The reordered date, or the unchanged input if it could not be reordered.
    """
    if date_precision_separator is None:
        return single_date

    # Drop components that are empty after stripping (e.g. produced by a trailing separator)
    components = [stripped for part in single_date.split(date_precision_separator) if (stripped := part.strip())]

    if date_format == DateFormat.YYYY_MM_DD:
        return "-".join(components)

    if date_format == DateFormat.DD_MM_YYYY:
        return "-".join(components[::-1])

    if date_format == DateFormat.MM_DD_YYYY:
        number_of_components = len(components)
        if number_of_components == 3:
            month, day, year = components
            return f"{year}-{month}-{day}"
        if number_of_components == 2:
            # month and year only
            return "-".join(components[::-1])
        if number_of_components == 1:
            # year only
            return components[0]
        # Zero or more than three components: warn and hand back the input as it was
        msg_info = MessageInfo(
            f"The provided input of a single date '{single_date}' could not be reformatted correctly.",
            resource_id=resource_id,
        )
        emit_xmllib_input_warning(msg_info)
        return single_date

    # None of the known formats matched
    msg_info = MessageInfo(
        f"The provided date format '{date_format}' to reformat the date is invalid.",
        resource_id=resource_id,
    )
    raise_xmllib_input_error(msg_info)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def find_dates_in_string(string: str) -> set[str]:
    """
    Searches a string for date values (single dates, or date ranges),
    and returns all dates found as a set of DSP-formatted strings.
    Returns an empty set if no date was found.
    [See XML documentation for details](https://docs.dasch.swiss/latest/DSP-TOOLS/file-formats/xml-data-file/#date).

    Notes:
        - If no era or calendar is given, dates are interpreted in the Common Era and the Gregorian calendar.
        - Standalone numbers from 000-2999, in 3/4-digit form, are interpreted as years CE.
        - If a number (with any number of digits) is followed by CE, C.E., AD, A.D., it is interpreted as years CE.
        - If a number (with any number of digits) is followed by BCE, BC, B.C., B.C.E., av. J.-C.,
          it is interpreted as years BCE.
        - Dates written with slashes are always interpreted in a European manner: 5/11/2021 is the 5th of November.
        - In the European notation, 2-digit years are expanded to 4 digits, with the current year as watershed:
            - 30.4.24 -> 30.04.2024
            - 30.4.50 -> 30.04.1950

    Currently supported date formats:
        - 0476-09-04 -> GREGORIAN:CE:0476-09-04:CE:0476-09-04
        - 0476_09_04 -> GREGORIAN:CE:0476-09-04:CE:0476-09-04
        - 30.4.2021 -> GREGORIAN:CE:2021-04-30:CE:2021-04-30
        - 30.4.21 -> GREGORIAN:CE:2021-04-30:CE:2021-04-30
        - 5/11/2021 -> GREGORIAN:CE:2021-11-05:CE:2021-11-05
        - Jan 26, 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
        - 26 Jan 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
        - 26 January 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
        - 26. Jan. 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
        - 26. Januar 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
        - 28.2.-1.12.1515 -> GREGORIAN:CE:1515-02-28:CE:1515-12-01
        - 25.-26.2.0800 -> GREGORIAN:CE:0800-02-25:CE:0800-02-26
        - 1.9.2022-3.1.2024 -> GREGORIAN:CE:2022-09-01:CE:2024-01-03
        - 1848 -> GREGORIAN:CE:1848:CE:1848
        - 1849/1850 -> GREGORIAN:CE:1849:CE:1850
        - 1849/50 -> GREGORIAN:CE:1849:CE:1850
        - 1845-50 -> GREGORIAN:CE:1845:CE:1850
        - 840-50 -> GREGORIAN:CE:840:CE:850
        - 840-1 -> GREGORIAN:CE:840:CE:841
        - 9 BC / 9 B.C. / 9 B.C.E. / 9 BCE -> GREGORIAN:BC:9:BC:9
        - 20 BCE - 50 CE -> GREGORIAN:BC:20:CE:50
        - 1000-900 av. J.-C. -> GREGORIAN:BC:1000:BC:900
        - 45 av. J.-C. -> GREGORIAN:BC:45:BC:45

    Args:
        string: string to check

    Returns:
        (possibly empty) set of DSP-formatted date strings

    Examples:
        ```python
        result = xmllib.find_dates_in_string("1849/1850")
        # result == {"GREGORIAN:CE:1849:CE:1850"}
        ```

        ```python
        result = xmllib.find_dates_in_string("not a valid date")
        # result == set()
        ```

        ```python
        result = xmllib.find_dates_in_string("first date: 2024. Second: 2025.")
        # result == {"GREGORIAN:CE:2024:CE:2024", "GREGORIAN:CE:2025:CE:2025"}
        ```
    """

    # The function may have been called on an empty or N/A cell: only search values that hold content
    if is_nonempty_value_internal(string):
        return _find_dates_in_string(string)
    return set()
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
_months_dict = {
|
|
434
|
+
"January": 1,
|
|
435
|
+
"Januar": 1,
|
|
436
|
+
"Jan": 1,
|
|
437
|
+
"February": 2,
|
|
438
|
+
"Februar": 2,
|
|
439
|
+
"Feb": 2,
|
|
440
|
+
"March": 3,
|
|
441
|
+
"März": 3,
|
|
442
|
+
"Mar": 3,
|
|
443
|
+
"April": 4,
|
|
444
|
+
"Apr": 4,
|
|
445
|
+
"May": 5,
|
|
446
|
+
"Mai": 5,
|
|
447
|
+
"June": 6,
|
|
448
|
+
"Juni": 6,
|
|
449
|
+
"Jun": 6,
|
|
450
|
+
"July": 7,
|
|
451
|
+
"Juli": 7,
|
|
452
|
+
"Jul": 7,
|
|
453
|
+
"August": 8,
|
|
454
|
+
"Aug": 8,
|
|
455
|
+
"September": 9,
|
|
456
|
+
"Sept": 9,
|
|
457
|
+
"October": 10,
|
|
458
|
+
"Oktober": 10,
|
|
459
|
+
"Oct": 10,
|
|
460
|
+
"Okt": 10,
|
|
461
|
+
"November": 11,
|
|
462
|
+
"Nov": 11,
|
|
463
|
+
"December": 12,
|
|
464
|
+
"Dezember": 12,
|
|
465
|
+
"Dec": 12,
|
|
466
|
+
"Dez": 12,
|
|
467
|
+
}
|
|
468
|
+
all_months = "|".join(_months_dict)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _find_dates_in_string(string: str) -> set[str]:
    """
    Run all date detectors over the string, from the most specific pattern to the most general one.

    Whatever a detector has matched is cut out of the string before the next detector runs,
    so that a more general pattern (e.g. a bare year) cannot match part of an already recognised date.

    Args:
        string: text to search

    Returns:
        (possibly empty) set of DSP-formatted date strings
    """
    # building blocks shared by the patterns below
    year_regex = r"([0-2]?[0-9][0-9][0-9])"
    year_regex_2_or_4_digits = r"((?:[0-2]?[0-9])?[0-9][0-9])"
    month_regex = r"([0-1]?[0-9])"
    day_regex = r"([0-3]?[0-9])"
    sep_regex = r"[\./]"
    lookbehind = r"(?<![0-9A-Za-z])"
    lookahead = r"(?![0-9A-Za-z])"
    range_operator_regex = r" ?- ?"

    results: set[str | None] = set()

    # These three detectors update `results` themselves and return what is left of the string
    remaining_string = _extract_already_parsed_date(string, results)
    remaining_string = _find_english_BC_or_CE_dates(
        string=remaining_string,
        lookbehind=lookbehind,
        lookahead=lookahead,
        range_operator_regex=range_operator_regex,
        results=results,
    )
    remaining_string = _find_french_bc_dates(
        string=remaining_string,
        lookbehind=lookbehind,
        lookahead=lookahead,
        range_operator_regex=range_operator_regex,
        results=results,
    )

    # template: 2021-01-01 | 2015_01_02
    iso_dates_regex = rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}"

    # template: 6.-8.3.1948 | 6/2/1947 - 24.03.1948
    eur_date_range_regex = (
        rf"{lookbehind}"
        rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex_2_or_4_digits}?)?{range_operator_regex}"
        rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex_2_or_4_digits}"
        rf"{lookahead}"
    )

    # template: 1.4.2021 | 5/11/2021
    eur_date_regex = rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex_2_or_4_digits}{lookahead}"

    # template: March 9, 1908 | March5,1908 | May 11, 1906
    monthname_date_regex = rf"{lookbehind}({all_months}) ?{day_regex}, ?{year_regex}{lookahead}"

    # template: 9 March 1908
    monthname_after_day_regex = rf"{lookbehind}{day_regex} ?({all_months}) ?{year_regex}{lookahead}"

    # template: 26. Januar 1993 | 26. Jan. 1993 | 26. Jan 1993
    german_monthname_date_regex = rf"{lookbehind}{day_regex}\.? ?({all_months})\.? ?{year_regex}{lookahead}"

    # template: 1849/50 | 1849-50 | 1849/1850
    year_range_regex = lookbehind + year_regex + r"[/-](\d{1,4})" + lookahead

    # template: 1907
    year_only_regex = rf"{lookbehind}{year_regex}{lookahead}"

    # The order of this table is significant: every step only sees what the previous steps left over
    detection_steps = (
        (iso_dates_regex, _from_iso_date),
        (eur_date_range_regex, _from_eur_date_range),
        (eur_date_regex, _from_eur_date),
        (monthname_date_regex, _from_monthname_date),
        (monthname_after_day_regex, _from_monthname_after_day),
        (german_monthname_date_regex, _from_german_monthname_date),
        (year_range_regex, _from_year_range),
        (year_only_regex, lambda x: f"GREGORIAN:CE:{int(x.group(0))}:CE:{int(x.group(0))}"),
    )
    for pattern, converter in detection_steps:
        matches = list(regex.finditer(pattern, remaining_string))
        if not matches:
            continue
        results.update(converter(match) for match in matches)
        remaining_string = _remove_used_spans(remaining_string, [match.span() for match in matches])

    # Converters return None for matches that are no valid date: filter those out
    return {x for x in results if x}
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def _remove_used_spans(string: str, spans: list[tuple[int, int]]) -> str:
|
|
557
|
+
"""Once a regex has matched parts of the original string, remove these parts, so that they're not matched again."""
|
|
558
|
+
for start, end in reversed(spans):
|
|
559
|
+
string = string[:start] + string[end:]
|
|
560
|
+
return string
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _find_english_BC_or_CE_dates(
    string: str,
    lookbehind: str,
    lookahead: str,
    range_operator_regex: str,
    results: set[str | None],
) -> str:
    """
    Collect years that carry an English era marker (BC, BCE, B.C., B.C.E., CE, AD, C.E., A.D.).

    Ranges are searched first, then single BC years, then single CE years.
    The spans matched by one search are cut out of the string before the next search runs.

    Args:
        string: text to search
        lookbehind: regex fragment placed in front of every pattern
        lookahead: regex fragment placed behind every pattern
        range_operator_regex: regex fragment matching the separator between start and end of a range
        results: set that is updated in place with the DSP-formatted dates found here

    Returns:
        the input string without the matched spans
    """
    eraless_date_regex = r"(\d+)"
    bc_era_regex = r"(?:BC|BCE|B\.C\.|B\.C\.E\.)"
    bc_date_regex = rf"(?:{eraless_date_regex} ?{bc_era_regex})"
    ce_era_regex = r"(?:CE|AD|C\.E\.|A\.D\.)"
    ce_date_regex = rf"(?:{eraless_date_regex} ?{ce_era_regex})"
    bc_or_ce_date_regex = rf"(?:{bc_date_regex}|{ce_date_regex})"
    # The start of a range may lack an era, the end must have one
    range_regex = (
        rf"{lookbehind}(?:{bc_or_ce_date_regex}|{eraless_date_regex})"
        rf"{range_operator_regex}"
        rf"{bc_or_ce_date_regex}{lookahead}"
    )

    leftover = string
    found: set[str | None] = set()

    # 1. ranges
    range_matches = list(regex.finditer(range_regex, leftover))
    for x in range_matches:
        found.add(
            _from_english_BC_or_CE_range(
                string=x.group(0),
                range_operator_regex=range_operator_regex,
                bc_era_regex=bc_era_regex,
                ce_era_regex=ce_era_regex,
                eraless_date_regex=eraless_date_regex,
            )
        )
    leftover = _remove_used_spans(leftover, [x.span() for x in range_matches])

    # 2. single BC years
    bc_matches = list(regex.finditer(rf"{lookbehind}{bc_date_regex}{lookahead}", leftover))
    for x in bc_matches:
        found.add(f"GREGORIAN:BC:{x.group(1)}:BC:{x.group(1)}")
    leftover = _remove_used_spans(leftover, [x.span() for x in bc_matches])

    # 3. single CE years
    ce_matches = list(regex.finditer(rf"{lookbehind}{ce_date_regex}{lookahead}", leftover))
    for x in ce_matches:
        found.add(f"GREGORIAN:CE:{x.group(1)}:CE:{x.group(1)}")
    leftover = _remove_used_spans(leftover, [x.span() for x in ce_matches])

    # Ranges that could not be converted are None: do not hand them on
    results.update({x for x in found if x})
    return leftover
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def _from_english_BC_or_CE_range(
|
|
611
|
+
string: str, range_operator_regex: str, bc_era_regex: str, ce_era_regex: str, eraless_date_regex: str
|
|
612
|
+
) -> str | None:
|
|
613
|
+
split_result = regex.split(range_operator_regex, string)
|
|
614
|
+
if len(split_result) != 2:
|
|
615
|
+
return None
|
|
616
|
+
start_raw, end_raw = split_result
|
|
617
|
+
if regex.search(bc_era_regex, end_raw):
|
|
618
|
+
end_era = "BC"
|
|
619
|
+
elif regex.search(ce_era_regex, end_raw):
|
|
620
|
+
end_era = "CE"
|
|
621
|
+
else:
|
|
622
|
+
return None
|
|
623
|
+
|
|
624
|
+
if regex.search(bc_era_regex, start_raw):
|
|
625
|
+
start_era = "BC"
|
|
626
|
+
elif regex.search(ce_era_regex, start_raw):
|
|
627
|
+
start_era = "CE"
|
|
628
|
+
else:
|
|
629
|
+
start_era = end_era
|
|
630
|
+
|
|
631
|
+
if not (start_year_match := regex.search(eraless_date_regex, start_raw)):
|
|
632
|
+
return None
|
|
633
|
+
if not (end_year_match := regex.search(eraless_date_regex, end_raw)):
|
|
634
|
+
return None
|
|
635
|
+
|
|
636
|
+
return f"GREGORIAN:{start_era}:{start_year_match.group(0)}:{end_era}:{end_year_match.group(0)}"
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def _find_french_bc_dates(
    string: str,
    lookbehind: str,
    lookahead: str,
    range_operator_regex: str,
    results: set[str | None],
) -> str:
    """
    Collect years and year ranges that are followed by the French BC marker "av. J.-C.".

    Ranges are handled before single years. A range whose end is greater than its start is left untouched,
    because BC years must count down.

    Args:
        string: text to search
        lookbehind: regex fragment placed in front of every pattern
        lookahead: regex fragment placed behind every pattern
        range_operator_regex: regex fragment matching the separator between start and end of a range
        results: set that is updated in place with the DSP-formatted dates found here

    Returns:
        the input string without the matched spans
    """
    french_bc_regex = r"av(?:\. |\.| )J\.?-?C\.?"
    year_regex = r"\d{1,5}"
    year_range_regex = rf"{lookbehind}({year_regex}){range_operator_regex}({year_regex}) {french_bc_regex}{lookahead}"
    single_year_regex = rf"{lookbehind}({year_regex}) {french_bc_regex}{lookahead}"

    leftover = string
    found: set[str | None] = set()

    # Go through the matches back to front, so that cutting one out does not shift the others
    range_matches = list(regex.finditer(year_range_regex, leftover))
    for year_range in range_matches[::-1]:
        start_year = int(year_range.group(1))
        end_year = int(year_range.group(2))
        if end_year <= start_year:
            found.add(f"GREGORIAN:BC:{start_year}:BC:{end_year}")
            leftover = _remove_used_spans(leftover, [year_range.span()])

    # The single years must be searched only now, in what the ranges have left over
    single_matches = list(regex.finditer(single_year_regex, leftover))
    for single_year in single_matches[::-1]:
        start_year = int(single_year.group(1))
        found.add(f"GREGORIAN:BC:{start_year}:BC:{start_year}")
        leftover = _remove_used_spans(leftover, [single_year.span()])

    results.update({x for x in found if x})
    return leftover
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
def _from_iso_date(iso_date: Match[str]) -> str | None:
|
|
671
|
+
year = int(iso_date.group(1))
|
|
672
|
+
month = int(iso_date.group(2))
|
|
673
|
+
day = int(iso_date.group(3))
|
|
674
|
+
try:
|
|
675
|
+
date = datetime.date(year, month, day)
|
|
676
|
+
return f"GREGORIAN:CE:{date.isoformat()}:CE:{date.isoformat()}"
|
|
677
|
+
except ValueError:
|
|
678
|
+
return None
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def _expand_2_digit_year(year: int) -> int:
|
|
682
|
+
current_year = datetime.date.today().year - 2000
|
|
683
|
+
if year <= current_year:
|
|
684
|
+
return year + 2000
|
|
685
|
+
elif year <= 99:
|
|
686
|
+
return year + 1900
|
|
687
|
+
else:
|
|
688
|
+
return year
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _from_eur_date_range(eur_date_range: Match[str]) -> str | None:
|
|
692
|
+
startday = int(eur_date_range.group(1))
|
|
693
|
+
startmonth = int(eur_date_range.group(2)) if eur_date_range.group(2) else int(eur_date_range.group(5))
|
|
694
|
+
startyear = int(eur_date_range.group(3)) if eur_date_range.group(3) else int(eur_date_range.group(6))
|
|
695
|
+
startyear = _expand_2_digit_year(startyear)
|
|
696
|
+
endday = int(eur_date_range.group(4))
|
|
697
|
+
endmonth = int(eur_date_range.group(5))
|
|
698
|
+
endyear = int(eur_date_range.group(6))
|
|
699
|
+
endyear = _expand_2_digit_year(endyear)
|
|
700
|
+
try:
|
|
701
|
+
startdate = datetime.date(startyear, startmonth, startday)
|
|
702
|
+
enddate = datetime.date(endyear, endmonth, endday)
|
|
703
|
+
except ValueError:
|
|
704
|
+
return None
|
|
705
|
+
if enddate < startdate:
|
|
706
|
+
return None
|
|
707
|
+
return f"GREGORIAN:CE:{startdate.isoformat()}:CE:{enddate.isoformat()}"
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
def _from_eur_date(eur_date: Match[str]) -> str | None:
|
|
711
|
+
startday = int(eur_date.group(1))
|
|
712
|
+
startmonth = int(eur_date.group(2))
|
|
713
|
+
startyear = int(eur_date.group(3))
|
|
714
|
+
startyear = _expand_2_digit_year(startyear)
|
|
715
|
+
try:
|
|
716
|
+
date = datetime.date(startyear, startmonth, startday)
|
|
717
|
+
return f"GREGORIAN:CE:{date.isoformat()}:CE:{date.isoformat()}"
|
|
718
|
+
except ValueError:
|
|
719
|
+
return None
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
def _from_monthname_date(monthname_date: Match[str]) -> str | None:
    """
    Convert a match of the form "month name, day, year" (groups 1 to 3) into a DSP date string.

    Returns:
        The DSP date string, or None if the numbers do not form a valid date.
    """
    month_name, day_raw, year_raw = monthname_date.group(1, 2, 3)
    day = int(day_raw)
    month = _months_dict[month_name]
    year = int(year_raw)
    try:
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    iso = parsed.isoformat()
    return f"GREGORIAN:CE:{iso}:CE:{iso}"
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _from_monthname_after_day(monthname_after_day: Match[str]) -> str | None:
    """
    Convert a match of the form "day, month name, year" (groups 1 to 3) into a DSP date string.

    Returns:
        The DSP date string, or None if the numbers do not form a valid date.
    """
    day_raw, month_name, year_raw = monthname_after_day.group(1, 2, 3)
    day = int(day_raw)
    month = _months_dict[month_name]
    year = int(year_raw)
    try:
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    iso = parsed.isoformat()
    return f"GREGORIAN:CE:{iso}:CE:{iso}"
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def _from_german_monthname_date(german_monthname_date: Match[str]) -> str | None:
    """
    Convert a match of the German form "day, month name, year" (groups 1 to 3) into a DSP date string.

    Returns:
        The DSP date string, or None if the numbers do not form a valid date.
    """
    day_raw, month_name, year_raw = german_monthname_date.group(1, 2, 3)
    day = int(day_raw)
    month = _months_dict[month_name]
    year = int(year_raw)
    try:
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    iso = parsed.isoformat()
    return f"GREGORIAN:CE:{iso}:CE:{iso}"
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
def _from_year_range(year_range: Match[str]) -> str | None:
|
|
756
|
+
startyear = int(year_range.group(1))
|
|
757
|
+
endyear = int(year_range.group(2))
|
|
758
|
+
if endyear // 10 == 0:
|
|
759
|
+
# endyear is only 1-digit: add the first 2-3 digits of startyear
|
|
760
|
+
endyear = startyear // 10 * 10 + endyear
|
|
761
|
+
elif endyear // 100 == 0:
|
|
762
|
+
# endyear is only 2-digit: add the first 1-2 digits of startyear
|
|
763
|
+
endyear = startyear // 100 * 100 + endyear
|
|
764
|
+
if endyear <= startyear:
|
|
765
|
+
return None
|
|
766
|
+
return f"GREGORIAN:CE:{startyear}:CE:{endyear}"
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def _extract_already_parsed_date(string: str, results: set[str | None]) -> str:
    """
    Collect dates that are already in the DSP format and cut them out of the string.

    The pattern expects a calendar, an era in front of the first date, and a second date
    that may or may not be preceded by an era.

    Args:
        string: text to search
        results: set that is updated in place with the dates found here

    Returns:
        the input string without the matched spans
    """
    rgx_year = r"\d+(-\d{2}(-\d{2})?)?"
    era_with_colon = r"(CE:|BC:)"
    rgx = rf"(GREGORIAN|JULIAN|ISLAMIC):{era_with_colon}{rgx_year}:{era_with_colon}?{rgx_year}"

    found = list(regex.finditer(rgx, string))
    if not found:
        return string
    results.update({match.group(0) for match in found})
    return _remove_used_spans(string, [match.span() for match in found])
|