dsp-tools 0.9.13__py3-none-any.whl → 18.3.0.post13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsp_tools/__init__.py +5 -0
- dsp_tools/cli/args.py +47 -0
- dsp_tools/cli/call_action.py +85 -0
- dsp_tools/cli/call_action_files_only.py +101 -0
- dsp_tools/cli/call_action_with_network.py +207 -0
- dsp_tools/cli/create_parsers.py +479 -0
- dsp_tools/cli/entry_point.py +322 -0
- dsp_tools/cli/utils.py +87 -0
- dsp_tools/clients/CLAUDE.md +420 -0
- dsp_tools/clients/authentication_client.py +14 -0
- dsp_tools/clients/authentication_client_live.py +66 -0
- dsp_tools/clients/connection.py +35 -0
- dsp_tools/clients/connection_live.py +233 -0
- dsp_tools/clients/fuseki_metrics.py +60 -0
- dsp_tools/clients/group_user_clients.py +35 -0
- dsp_tools/clients/group_user_clients_live.py +181 -0
- dsp_tools/clients/legal_info_client.py +23 -0
- dsp_tools/clients/legal_info_client_live.py +132 -0
- dsp_tools/clients/list_client.py +49 -0
- dsp_tools/clients/list_client_live.py +166 -0
- dsp_tools/clients/metadata_client.py +24 -0
- dsp_tools/clients/metadata_client_live.py +47 -0
- dsp_tools/clients/ontology_clients.py +49 -0
- dsp_tools/clients/ontology_create_client_live.py +166 -0
- dsp_tools/clients/ontology_get_client_live.py +80 -0
- dsp_tools/clients/permissions_client.py +68 -0
- dsp_tools/clients/project_client.py +16 -0
- dsp_tools/clients/project_client_live.py +66 -0
- dsp_tools/commands/create/communicate_problems.py +24 -0
- dsp_tools/commands/create/create.py +134 -0
- dsp_tools/commands/create/create_on_server/cardinalities.py +111 -0
- dsp_tools/commands/create/create_on_server/classes.py +99 -0
- dsp_tools/commands/create/create_on_server/complete_ontologies.py +116 -0
- dsp_tools/commands/create/create_on_server/default_permissions.py +134 -0
- dsp_tools/commands/create/create_on_server/group_users.py +165 -0
- dsp_tools/commands/create/create_on_server/lists.py +163 -0
- dsp_tools/commands/create/create_on_server/mappers.py +12 -0
- dsp_tools/commands/create/create_on_server/onto_utils.py +74 -0
- dsp_tools/commands/create/create_on_server/ontology.py +52 -0
- dsp_tools/commands/create/create_on_server/project.py +68 -0
- dsp_tools/commands/create/create_on_server/properties.py +119 -0
- dsp_tools/commands/create/exceptions.py +29 -0
- dsp_tools/commands/create/lists_only.py +66 -0
- dsp_tools/commands/create/models/create_problems.py +87 -0
- dsp_tools/commands/create/models/parsed_ontology.py +88 -0
- dsp_tools/commands/create/models/parsed_project.py +81 -0
- dsp_tools/commands/create/models/rdf_ontology.py +12 -0
- dsp_tools/commands/create/models/server_project_info.py +100 -0
- dsp_tools/commands/create/parsing/parse_lists.py +45 -0
- dsp_tools/commands/create/parsing/parse_ontology.py +243 -0
- dsp_tools/commands/create/parsing/parse_project.py +149 -0
- dsp_tools/commands/create/parsing/parsing_utils.py +40 -0
- dsp_tools/commands/create/project_validate.py +595 -0
- dsp_tools/commands/create/serialisation/ontology.py +119 -0
- dsp_tools/commands/create/serialisation/project.py +44 -0
- dsp_tools/commands/excel2json/CLAUDE.md +101 -0
- dsp_tools/commands/excel2json/json_header.py +321 -0
- dsp_tools/commands/excel2json/lists/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/compliance_checks.py +292 -0
- dsp_tools/commands/excel2json/lists/make_lists.py +247 -0
- dsp_tools/commands/excel2json/lists/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/models/deserialise.py +30 -0
- dsp_tools/commands/excel2json/lists/models/input_error.py +216 -0
- dsp_tools/commands/excel2json/lists/models/serialise.py +57 -0
- dsp_tools/commands/excel2json/lists/utils.py +81 -0
- dsp_tools/commands/excel2json/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/models/input_error.py +416 -0
- dsp_tools/commands/excel2json/models/json_header.py +175 -0
- dsp_tools/commands/excel2json/models/list_node_name.py +16 -0
- dsp_tools/commands/excel2json/models/ontology.py +76 -0
- dsp_tools/commands/excel2json/old_lists.py +328 -0
- dsp_tools/commands/excel2json/project.py +280 -0
- dsp_tools/commands/excel2json/properties.py +370 -0
- dsp_tools/commands/excel2json/resources.py +336 -0
- dsp_tools/commands/excel2json/utils.py +352 -0
- dsp_tools/commands/excel2xml/__init__.py +7 -0
- dsp_tools/commands/excel2xml/excel2xml_cli.py +523 -0
- dsp_tools/commands/excel2xml/excel2xml_lib.py +1953 -0
- dsp_tools/commands/excel2xml/propertyelement.py +47 -0
- dsp_tools/commands/get/__init__.py +0 -0
- dsp_tools/commands/get/get.py +166 -0
- dsp_tools/commands/get/get_permissions.py +257 -0
- dsp_tools/commands/get/get_permissions_legacy.py +89 -0
- dsp_tools/commands/get/legacy_models/__init__.py +0 -0
- dsp_tools/commands/get/legacy_models/context.py +318 -0
- dsp_tools/commands/get/legacy_models/group.py +241 -0
- dsp_tools/commands/get/legacy_models/helpers.py +47 -0
- dsp_tools/commands/get/legacy_models/listnode.py +390 -0
- dsp_tools/commands/get/legacy_models/model.py +12 -0
- dsp_tools/commands/get/legacy_models/ontology.py +324 -0
- dsp_tools/commands/get/legacy_models/project.py +366 -0
- dsp_tools/commands/get/legacy_models/propertyclass.py +417 -0
- dsp_tools/commands/get/legacy_models/resourceclass.py +676 -0
- dsp_tools/commands/get/legacy_models/user.py +438 -0
- dsp_tools/commands/get/models/__init__.py +0 -0
- dsp_tools/commands/get/models/permissions_models.py +10 -0
- dsp_tools/commands/id2iri.py +258 -0
- dsp_tools/commands/ingest_xmlupload/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/bulk_ingest_client.py +178 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/apply_ingest_id.py +69 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/upload_xml.py +166 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/user_information.py +121 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/ingest_files.py +64 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/filechecker.py +20 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +57 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_failures.py +66 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_files.py +67 -0
- dsp_tools/commands/resume_xmlupload/__init__.py +0 -0
- dsp_tools/commands/resume_xmlupload/resume_xmlupload.py +96 -0
- dsp_tools/commands/start_stack.py +428 -0
- dsp_tools/commands/update_legal/CLAUDE.md +344 -0
- dsp_tools/commands/update_legal/__init__.py +0 -0
- dsp_tools/commands/update_legal/core.py +182 -0
- dsp_tools/commands/update_legal/csv_operations.py +135 -0
- dsp_tools/commands/update_legal/models.py +87 -0
- dsp_tools/commands/update_legal/xml_operations.py +247 -0
- dsp_tools/commands/validate_data/CLAUDE.md +159 -0
- dsp_tools/commands/validate_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/constants.py +59 -0
- dsp_tools/commands/validate_data/mappers.py +143 -0
- dsp_tools/commands/validate_data/models/__init__.py +0 -0
- dsp_tools/commands/validate_data/models/api_responses.py +45 -0
- dsp_tools/commands/validate_data/models/input_problems.py +119 -0
- dsp_tools/commands/validate_data/models/rdf_like_data.py +117 -0
- dsp_tools/commands/validate_data/models/validation.py +106 -0
- dsp_tools/commands/validate_data/prepare_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/prepare_data/get_rdf_like_data.py +296 -0
- dsp_tools/commands/validate_data/prepare_data/make_data_graph.py +91 -0
- dsp_tools/commands/validate_data/prepare_data/prepare_data.py +184 -0
- dsp_tools/commands/validate_data/process_validation_report/__init__.py +0 -0
- dsp_tools/commands/validate_data/process_validation_report/get_user_validation_message.py +358 -0
- dsp_tools/commands/validate_data/process_validation_report/query_validation_result.py +507 -0
- dsp_tools/commands/validate_data/process_validation_report/reformat_validation_results.py +150 -0
- dsp_tools/commands/validate_data/shacl_cli_validator.py +70 -0
- dsp_tools/commands/validate_data/sparql/__init__.py +0 -0
- dsp_tools/commands/validate_data/sparql/cardinality_shacl.py +209 -0
- dsp_tools/commands/validate_data/sparql/construct_shacl.py +92 -0
- dsp_tools/commands/validate_data/sparql/legal_info_shacl.py +36 -0
- dsp_tools/commands/validate_data/sparql/value_shacl.py +357 -0
- dsp_tools/commands/validate_data/utils.py +59 -0
- dsp_tools/commands/validate_data/validate_data.py +283 -0
- dsp_tools/commands/validate_data/validation/__init__.py +0 -0
- dsp_tools/commands/validate_data/validation/check_duplicate_files.py +55 -0
- dsp_tools/commands/validate_data/validation/check_for_unknown_classes.py +67 -0
- dsp_tools/commands/validate_data/validation/get_validation_report.py +94 -0
- dsp_tools/commands/validate_data/validation/validate_ontology.py +107 -0
- dsp_tools/commands/xmlupload/CLAUDE.md +292 -0
- dsp_tools/commands/xmlupload/__init__.py +0 -0
- dsp_tools/commands/xmlupload/iri_resolver.py +21 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/__init__.py +0 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/constants.py +63 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/jsonld_utils.py +44 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_file_value.py +77 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_resource_and_values.py +114 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_values.py +262 -0
- dsp_tools/commands/xmlupload/models/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/bitstream_info.py +18 -0
- dsp_tools/commands/xmlupload/models/formatted_text_value.py +10 -0
- dsp_tools/commands/xmlupload/models/ingest.py +143 -0
- dsp_tools/commands/xmlupload/models/input_problems.py +58 -0
- dsp_tools/commands/xmlupload/models/lookup_models.py +21 -0
- dsp_tools/commands/xmlupload/models/permission.py +45 -0
- dsp_tools/commands/xmlupload/models/permissions_parsed.py +93 -0
- dsp_tools/commands/xmlupload/models/processed/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/processed/file_values.py +29 -0
- dsp_tools/commands/xmlupload/models/processed/res.py +27 -0
- dsp_tools/commands/xmlupload/models/processed/values.py +101 -0
- dsp_tools/commands/xmlupload/models/rdf_models.py +26 -0
- dsp_tools/commands/xmlupload/models/upload_clients.py +14 -0
- dsp_tools/commands/xmlupload/models/upload_state.py +20 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/__init__.py +0 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/ark2iri.py +55 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/get_processed_resources.py +252 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/iiif_uri_validator.py +50 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/list_client.py +120 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/prepare_xml_input.py +67 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/read_validate_xml_file.py +58 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/transform_input_values.py +118 -0
- dsp_tools/commands/xmlupload/resource_create_client.py +25 -0
- dsp_tools/commands/xmlupload/richtext_id2iri.py +37 -0
- dsp_tools/commands/xmlupload/stash/__init__.py +0 -0
- dsp_tools/commands/xmlupload/stash/analyse_circular_reference_graph.py +236 -0
- dsp_tools/commands/xmlupload/stash/create_info_for_graph.py +53 -0
- dsp_tools/commands/xmlupload/stash/graph_models.py +87 -0
- dsp_tools/commands/xmlupload/stash/stash_circular_references.py +68 -0
- dsp_tools/commands/xmlupload/stash/stash_models.py +109 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_resptr_props.py +106 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_xml_texts.py +196 -0
- dsp_tools/commands/xmlupload/upload_config.py +76 -0
- dsp_tools/commands/xmlupload/write_diagnostic_info.py +27 -0
- dsp_tools/commands/xmlupload/xmlupload.py +516 -0
- dsp_tools/config/__init__.py +0 -0
- dsp_tools/config/logger_config.py +69 -0
- dsp_tools/config/warnings_config.py +32 -0
- dsp_tools/error/__init__.py +0 -0
- dsp_tools/error/custom_warnings.py +39 -0
- dsp_tools/error/exceptions.py +204 -0
- dsp_tools/error/problems.py +10 -0
- dsp_tools/error/xmllib_errors.py +20 -0
- dsp_tools/error/xmllib_warnings.py +54 -0
- dsp_tools/error/xmllib_warnings_util.py +159 -0
- dsp_tools/error/xsd_validation_error_msg.py +19 -0
- dsp_tools/legacy_models/__init__.py +0 -0
- dsp_tools/legacy_models/datetimestamp.py +81 -0
- dsp_tools/legacy_models/langstring.py +253 -0
- dsp_tools/legacy_models/projectContext.py +49 -0
- dsp_tools/py.typed +0 -0
- dsp_tools/resources/schema/data.xsd +648 -0
- dsp_tools/resources/schema/lists-only.json +72 -0
- dsp_tools/resources/schema/project.json +1258 -0
- dsp_tools/resources/schema/properties-only.json +874 -0
- dsp_tools/resources/schema/resources-only.json +140 -0
- dsp_tools/resources/start-stack/docker-compose.override-host.j2 +11 -0
- dsp_tools/resources/start-stack/docker-compose.override.yml +11 -0
- dsp_tools/resources/start-stack/docker-compose.yml +88 -0
- dsp_tools/resources/start-stack/dsp-app-config.json +45 -0
- dsp_tools/resources/start-stack/dsp-app-config.override-host.j2 +26 -0
- dsp_tools/resources/validate_data/api-shapes-resource-cardinalities.ttl +191 -0
- dsp_tools/resources/validate_data/api-shapes.ttl +804 -0
- dsp_tools/resources/validate_data/shacl-cli-image.yml +4 -0
- dsp_tools/resources/validate_data/validate-ontology.ttl +99 -0
- dsp_tools/utils/__init__.py +0 -0
- dsp_tools/utils/ansi_colors.py +32 -0
- dsp_tools/utils/data_formats/__init__.py +0 -0
- dsp_tools/utils/data_formats/date_util.py +166 -0
- dsp_tools/utils/data_formats/iri_util.py +30 -0
- dsp_tools/utils/data_formats/shared.py +81 -0
- dsp_tools/utils/data_formats/uri_util.py +76 -0
- dsp_tools/utils/fuseki_bloating.py +63 -0
- dsp_tools/utils/json_parsing.py +22 -0
- dsp_tools/utils/rdf_constants.py +42 -0
- dsp_tools/utils/rdflib_utils.py +10 -0
- dsp_tools/utils/replace_id_with_iri.py +66 -0
- dsp_tools/utils/request_utils.py +238 -0
- dsp_tools/utils/xml_parsing/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/get_lookups.py +32 -0
- dsp_tools/utils/xml_parsing/get_parsed_resources.py +325 -0
- dsp_tools/utils/xml_parsing/models/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/models/parsed_resource.py +76 -0
- dsp_tools/utils/xml_parsing/parse_clean_validate_xml.py +137 -0
- dsp_tools/xmllib/CLAUDE.md +302 -0
- dsp_tools/xmllib/__init__.py +49 -0
- dsp_tools/xmllib/general_functions.py +877 -0
- dsp_tools/xmllib/internal/__init__.py +0 -0
- dsp_tools/xmllib/internal/checkers.py +162 -0
- dsp_tools/xmllib/internal/circumvent_circular_imports.py +36 -0
- dsp_tools/xmllib/internal/constants.py +46 -0
- dsp_tools/xmllib/internal/input_converters.py +155 -0
- dsp_tools/xmllib/internal/serialise_file_value.py +57 -0
- dsp_tools/xmllib/internal/serialise_resource.py +177 -0
- dsp_tools/xmllib/internal/serialise_values.py +152 -0
- dsp_tools/xmllib/internal/type_aliases.py +11 -0
- dsp_tools/xmllib/models/__init__.py +0 -0
- dsp_tools/xmllib/models/config_options.py +28 -0
- dsp_tools/xmllib/models/date_formats.py +48 -0
- dsp_tools/xmllib/models/dsp_base_resources.py +1542 -0
- dsp_tools/xmllib/models/internal/__init__.py +0 -0
- dsp_tools/xmllib/models/internal/file_values.py +172 -0
- dsp_tools/xmllib/models/internal/geometry.py +162 -0
- dsp_tools/xmllib/models/internal/migration_metadata.py +55 -0
- dsp_tools/xmllib/models/internal/serialise_permissions.py +66 -0
- dsp_tools/xmllib/models/internal/values.py +342 -0
- dsp_tools/xmllib/models/licenses/__init__.py +0 -0
- dsp_tools/xmllib/models/licenses/other.py +59 -0
- dsp_tools/xmllib/models/licenses/recommended.py +107 -0
- dsp_tools/xmllib/models/permissions.py +41 -0
- dsp_tools/xmllib/models/res.py +1782 -0
- dsp_tools/xmllib/models/root.py +348 -0
- dsp_tools/xmllib/value_checkers.py +434 -0
- dsp_tools/xmllib/value_converters.py +777 -0
- dsp_tools-18.3.0.post13.dist-info/METADATA +90 -0
- dsp_tools-18.3.0.post13.dist-info/RECORD +286 -0
- dsp_tools-18.3.0.post13.dist-info/WHEEL +4 -0
- dsp_tools-18.3.0.post13.dist-info/entry_points.txt +3 -0
- dsp_tools-0.9.13.dist-info/LICENSE +0 -674
- dsp_tools-0.9.13.dist-info/METADATA +0 -144
- dsp_tools-0.9.13.dist-info/RECORD +0 -71
- dsp_tools-0.9.13.dist-info/WHEEL +0 -5
- dsp_tools-0.9.13.dist-info/entry_points.txt +0 -3
- dsp_tools-0.9.13.dist-info/top_level.txt +0 -1
- dsplib/models/connection.py +0 -272
- dsplib/models/group.py +0 -296
- dsplib/models/helpers.py +0 -505
- dsplib/models/langstring.py +0 -277
- dsplib/models/listnode.py +0 -578
- dsplib/models/model.py +0 -20
- dsplib/models/ontology.py +0 -448
- dsplib/models/permission.py +0 -112
- dsplib/models/project.py +0 -547
- dsplib/models/propertyclass.py +0 -505
- dsplib/models/resource.py +0 -366
- dsplib/models/resourceclass.py +0 -810
- dsplib/models/sipi.py +0 -30
- dsplib/models/user.py +0 -731
- dsplib/models/value.py +0 -1000
- dsplib/utils/knora-data-schema.xsd +0 -454
- dsplib/utils/knora-schema-lists.json +0 -83
- dsplib/utils/knora-schema.json +0 -434
- dsplib/utils/onto_commons.py +0 -24
- dsplib/utils/onto_create_lists.py +0 -73
- dsplib/utils/onto_create_ontology.py +0 -442
- dsplib/utils/onto_get.py +0 -58
- dsplib/utils/onto_validate.py +0 -33
- dsplib/utils/xml_upload.py +0 -539
- dsplib/widgets/doublepassword.py +0 -80
- knora/MLS-import-libraries.py +0 -84
- knora/dsp_tools.py +0 -96
- knora/dsplib/models/connection.py +0 -272
- knora/dsplib/models/group.py +0 -296
- knora/dsplib/models/helpers.py +0 -506
- knora/dsplib/models/langstring.py +0 -277
- knora/dsplib/models/listnode.py +0 -578
- knora/dsplib/models/model.py +0 -20
- knora/dsplib/models/ontology.py +0 -448
- knora/dsplib/models/permission.py +0 -112
- knora/dsplib/models/project.py +0 -583
- knora/dsplib/models/propertyclass.py +0 -505
- knora/dsplib/models/resource.py +0 -416
- knora/dsplib/models/resourceclass.py +0 -811
- knora/dsplib/models/sipi.py +0 -35
- knora/dsplib/models/user.py +0 -731
- knora/dsplib/models/value.py +0 -1000
- knora/dsplib/utils/knora-data-schema.xsd +0 -464
- knora/dsplib/utils/knora-schema-lists.json +0 -83
- knora/dsplib/utils/knora-schema.json +0 -444
- knora/dsplib/utils/onto_commons.py +0 -24
- knora/dsplib/utils/onto_create_lists.py +0 -73
- knora/dsplib/utils/onto_create_ontology.py +0 -451
- knora/dsplib/utils/onto_get.py +0 -58
- knora/dsplib/utils/onto_validate.py +0 -33
- knora/dsplib/utils/xml_upload.py +0 -540
- knora/dsplib/widgets/doublepassword.py +0 -80
- knora/knora.py +0 -2108
- knora/test.py +0 -99
- knora/testit.py +0 -76
- knora/xml2knora.py +0 -633
- {dsplib → dsp_tools/cli}/__init__.py +0 -0
- {dsplib/models → dsp_tools/clients}/__init__.py +0 -0
- {dsplib/utils → dsp_tools/commands}/__init__.py +0 -0
- {dsplib/widgets → dsp_tools/commands/create}/__init__.py +0 -0
- {knora → dsp_tools/commands/create/create_on_server}/__init__.py +0 -0
- {knora/dsplib → dsp_tools/commands/create/models}/__init__.py +0 -0
- {knora/dsplib/models → dsp_tools/commands/create/parsing}/__init__.py +0 -0
- {knora/dsplib/utils → dsp_tools/commands/create/serialisation}/__init__.py +0 -0
- {knora/dsplib/widgets → dsp_tools/commands/excel2json}/__init__.py +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import warnings
|
|
4
|
+
from collections.abc import Mapping
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import regex
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from lxml import etree
|
|
11
|
+
|
|
12
|
+
from dsp_tools.error.custom_warnings import DspToolsUserWarning
|
|
13
|
+
from dsp_tools.error.exceptions import InputError
|
|
14
|
+
from dsp_tools.utils.xml_parsing.parse_clean_validate_xml import parse_and_clean_xml_file
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _check_input_parameters(
    xml_file: str,
    json_file: str,
) -> tuple[Path, Path]:
    """
    Convert both input file names into Path objects
    and make sure that each of them points to an existing file.

    Args:
        xml_file: the XML file with the data to be replaced
        json_file: the JSON file with the mapping (dict) of internal IDs to IRIs

    Raises:
        InputError: if one of the files could not be found

    Returns:
        the path of the XML file and the path of the JSON file, in this order
    """
    checked_paths: list[Path] = []
    # The XML file is checked first, so it is the one that gets reported if both files are missing.
    for raw_name in (xml_file, json_file):
        candidate = Path(raw_name)
        if not candidate.is_file():
            not_found_msg = f"File {raw_name} could not be found."
            logger.error(not_found_msg)
            raise InputError(not_found_msg)
        checked_paths.append(candidate)

    xml_path, json_path = checked_paths
    return xml_path, json_path
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _parse_json_file(json_file: Path) -> dict[str, str]:
    """
    Load the content of a JSON file.

    Args:
        json_file: path to the JSON file, which is read as UTF-8

    Returns:
        the deserialised content of the file
        (annotated as a dict of strings, but the content is not validated here)
    """
    with open(json_file, mode="r", encoding="utf-8") as json_handle:
        raw_content = json_handle.read()
    id_to_iri: dict[str, str] = json.loads(raw_content)
    return id_to_iri
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _replace_resptrs(
    tree: etree._Element,
    mapping: Mapping[str, str],
    used_mapping_entries: set[str],
) -> tuple[etree._Element, set[str]]:
    """
    Swap the internal IDs found in link elements for the IRIs given in the mapping.

    The elements considered are `<resptr>` inside `<resptr-prop>` of `<resource>`, `<link>` and `<region>`,
    as well as `<isSegmentOf>` and `<relatesTo>` of `<video-segment>` and `<audio-segment>`.
    An element is left untouched if the mapping has no (non-empty) entry for its text.

    Args:
        tree: parsed XML file (it is deep-copied, the input tree is not modified)
        mapping: mapping of internal IDs to IRIs
        used_mapping_entries: IDs of the mapping that have already been used; this set is extended in place

    Returns:
        a tuple of the modified copy of the XML tree, and the set of the IDs that have been replaced
    """
    tree_copy = copy.deepcopy(tree)

    resptr_parents = ["resource", "link", "region"]
    segment_kinds = ["video", "audio"]
    candidate_paths = [f"/knora/{parent}/resptr-prop/resptr" for parent in resptr_parents]
    candidate_paths += [f"/knora/{kind}-segment/isSegmentOf" for kind in segment_kinds]
    candidate_paths += [f"/knora/{kind}-segment/relatesTo" for kind in segment_kinds]
    link_elements = tree_copy.xpath("|".join(candidate_paths))

    num_replaced = 0
    for link_element in link_elements:
        internal_id = link_element.text
        iri = mapping.get(internal_id)
        if not iri:
            # no usable IRI for this ID: keep the original value
            continue
        link_element.text = iri
        used_mapping_entries.add(internal_id)
        num_replaced += 1

    summary = f"Replaced {num_replaced}/{len(link_elements)} resptr links in the XML file"
    logger.info(summary)
    print(summary)

    return tree_copy, used_mapping_entries
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _replace_salsah_links(
    tree: etree._Element,
    mapping: Mapping[str, str],
    used_mapping_entries: set[str],
) -> tuple[etree._Element, set[str]]:
    """
    Swap the internal IDs found in salsah-links for the IRIs given in the mapping.

    Salsah-links are `<a class="salsah-link">` elements anywhere below `<text>` (inside `<text-prop>`)
    of `<resource>`, `<link>` and `<region>`,
    or below `<hasComment>` / `<hasDescription>` of `<video-segment>` and `<audio-segment>`.
    The ID is the `href` attribute with all occurrences of `IRI:` and `:IRI` stripped.
    A link is left untouched if the mapping has no (non-empty) entry for its ID.

    Args:
        tree: parsed XML file (it is deep-copied, the input tree is not modified)
        mapping: mapping of internal IDs to IRIs
        used_mapping_entries: IDs of the mapping that have already been used; this set is extended in place

    Returns:
        a tuple of the modified copy of the XML tree, and the set of the IDs that have been replaced
    """
    tree_copy = copy.deepcopy(tree)

    text_parents = ["resource", "link", "region"]
    segment_kinds = ["video", "audio"]
    candidate_paths = [f"/knora/{parent}/text-prop/text//a" for parent in text_parents]
    candidate_paths += [f"/knora/{kind}-segment/hasComment//a" for kind in segment_kinds]
    candidate_paths += [f"/knora/{kind}-segment/hasDescription//a" for kind in segment_kinds]
    all_anchors = tree_copy.xpath("|".join(candidate_paths))
    # only anchors explicitly marked as salsah-link are links to other resources
    salsah_anchors = [anchor for anchor in all_anchors if anchor.attrib.get("class") == "salsah-link"]

    num_replaced = 0
    for anchor in salsah_anchors:
        raw_href = anchor.attrib.get("href", "")
        internal_id = regex.sub("IRI:|:IRI", "", raw_href)
        iri = mapping.get(internal_id)
        if not iri:
            # no usable IRI for this ID: keep the original href
            continue
        anchor.attrib["href"] = iri
        used_mapping_entries.add(internal_id)
        num_replaced += 1

    summary = f"Replaced {num_replaced}/{len(salsah_anchors)} salsah-links in the XML file"
    logger.info(summary)
    print(summary)

    return tree_copy, used_mapping_entries
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _replace_ids_by_iris(
    tree: etree._Element,
    mapping: Mapping[str, str],
) -> etree._Element:
    """
    Replace the internal IDs by IRIs,
    first in the `<resptr>` tags, then in the salsah-links of the `<text>` tags.
    IDs that are not part of the mapping are kept as they are.
    At the end, it is reported how many entries of the mapping have been used.

    Args:
        tree: parsed XML file
        mapping: mapping of internal IDs to IRIs

    Returns:
        a modified copy of the XML tree
    """
    # one set is shared by both replacement steps, so every mapping entry is counted only once
    ids_seen: set[str] = set()

    tree_with_resptrs, ids_seen = _replace_resptrs(
        tree=tree,
        mapping=mapping,
        used_mapping_entries=ids_seen,
    )
    tree_with_links, ids_seen = _replace_salsah_links(
        tree=tree_with_resptrs,
        mapping=mapping,
        used_mapping_entries=ids_seen,
    )

    usage_summary = f"Used {len(ids_seen)}/{len(mapping)} entries from the mapping file"
    logger.info(usage_summary)
    print(usage_summary)

    return tree_with_links
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _remove_resources_if_id_in_mapping(
    tree: etree._Element,
    mapping: Mapping[str, str],
) -> etree._Element:
    """
    Drop every resource whose ID is a key of the mapping.

    The elements considered are the `<resource>`, `<link>`, `<region>`, `<video-segment>`
    and `<audio-segment>` children of `<knora>`.
    A warning with the number of removed resources is always emitted, even if nothing was removed.

    Args:
        tree: parsed XML file (it is deep-copied, the input tree is not modified)
        mapping: mapping of internal IDs to IRIs

    Returns:
        a modified copy of the XML tree
    """
    tree_copy = copy.deepcopy(tree)

    resource_tags = ["resource", "link", "region", "video-segment", "audio-segment"]
    all_resources = tree_copy.xpath("|".join(f"/knora/{tag}" for tag in resource_tags))
    already_mapped = [res for res in all_resources if res.attrib.get("id") in mapping]
    for res in already_mapped:
        res.getparent().remove(res)

    removal_msg = (
        f"Removed {len(already_mapped)}/{len(all_resources)} resources from the XML file, "
        "because their ID was in the mapping"
    )
    logger.warning(removal_msg)
    warnings.warn(DspToolsUserWarning(removal_msg))

    return tree_copy
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _write_output_file(
    orig_xml_file: Path,
    tree: etree._Element,
) -> None:
    """
    Serialise the modified XML tree into a new file.

    The file name is "[stem of the original file]_replaced_[timestamp].xml".
    It is a bare file name without directory part.

    Args:
        orig_xml_file: XML file that was provided as input (only its stem is used)
        tree: modified XML tree with replaced IDs
    """
    now_formatted = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_file = f"{orig_xml_file.stem}_replaced_{now_formatted}.xml"
    etree.ElementTree(tree).write(out_file, pretty_print=True, xml_declaration=True, encoding="utf-8")

    done_msg = f"XML with replaced IDs was written to file {out_file}."
    logger.info(done_msg)
    print(done_msg)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def id2iri(
    xml_file: str,
    json_file: str,
    remove_resource_if_id_in_mapping: bool = False,
) -> bool:
    """
    Take an XML file and a mapping file,
    and replace the internal IDs of the XML file
    (`<resptr>` tags and salsah-links inside `<text>` tags)
    by the IRIs found in the mapping.
    Internal IDs that are not part of the mapping are kept as they are.
    The result is written to a new XML file named "[original name]_replaced_[timestamp].xml".

    Args:
        xml_file: the XML file with the data to be replaced
        json_file: the JSON file with the mapping (dict) of internal IDs to IRIs
        remove_resource_if_id_in_mapping: if True, remove all resources from the XML file if their ID is in the mapping

    Raises:
        InputError: if one of the two input files is not a valid file

    Returns:
        success status
    """
    xml_path, json_path = _check_input_parameters(xml_file=xml_file, json_file=json_file)
    id_to_iri = _parse_json_file(json_path)
    xml_root = parse_and_clean_xml_file(xml_path)

    xml_root = _replace_ids_by_iris(tree=xml_root, mapping=id_to_iri)
    if remove_resource_if_id_in_mapping:
        # the removal runs after the replacement, on the tree that already contains the IRIs
        xml_root = _remove_resources_if_id_in_mapping(tree=xml_root, mapping=id_to_iri)

    _write_output_file(orig_xml_file=xml_path, tree=xml_root)
    return True
|
|
File without changes
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import urllib.parse
|
|
2
|
+
from collections.abc import Iterator
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from dataclasses import field
|
|
5
|
+
from http import HTTPStatus
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import regex
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from requests import JSONDecodeError
|
|
11
|
+
from requests import RequestException
|
|
12
|
+
from requests import Session
|
|
13
|
+
from requests.adapters import HTTPAdapter
|
|
14
|
+
from requests.adapters import Retry
|
|
15
|
+
|
|
16
|
+
from dsp_tools.clients.authentication_client import AuthenticationClient
|
|
17
|
+
from dsp_tools.commands.ingest_xmlupload.upload_files.upload_failures import UploadFailure
|
|
18
|
+
from dsp_tools.config.logger_config import LOGGER_SAVEPATH
|
|
19
|
+
from dsp_tools.error.exceptions import BadCredentialsError
|
|
20
|
+
from dsp_tools.error.exceptions import InputError
|
|
21
|
+
from dsp_tools.utils.request_utils import RequestParameters
|
|
22
|
+
from dsp_tools.utils.request_utils import log_request
|
|
23
|
+
from dsp_tools.utils.request_utils import log_response
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class BulkIngestClient:
    """
    Client to upload multiple files to the ingest server and monitor the ingest process.

    Attributes:
        dsp_ingest_url: base URL of the ingest server; prefix of every route used by this client
        authentication_client: provides the bearer token that is sent with every request
        shortcode: shortcode of the project; part of every route used by this client
        imgdir: directory against which the paths passed to `upload_file()` are resolved
            (defaults to the working directory at the moment the client is instantiated)
        session: requests session with retry behaviour, created in `__post_init__()`
    """

    dsp_ingest_url: str
    authentication_client: AuthenticationClient
    shortcode: str
    # default_factory: look up the working directory per instance, instead of freezing it once at import time
    imgdir: Path = field(default_factory=Path.cwd)
    session: Session = field(init=False)
    # Not annotated, hence a plain class attribute and not a dataclass field.
    # retrieve_mapping_generator() assigns to it via `self`, which creates a per-instance counter.
    retrieval_failures = 0

    def __post_init__(self) -> None:
        """Create the session and let it retry failed requests (all HTTP methods) on 500 and 503 responses."""
        retries = 6
        self.session = Session()
        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=0.3,
            allowed_methods=None,  # means all methods
            status_forcelist=[HTTPStatus.INTERNAL_SERVER_ERROR, HTTPStatus.SERVICE_UNAVAILABLE],
        )
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

    def upload_file(
        self,
        filepath: Path,
    ) -> UploadFailure | None:
        """
        Uploads a file to the ingest server.
        The load balancer on DSP servers currently has a timeout of 10m,
        so we need to use a slightly shorter timeout of 9m.
        See https://linear.app/dasch/issue/INFRA-847/increase-traefik-readtimeout

        Args:
            filepath: path of the file to upload; it is opened relative to `self.imgdir`

        Returns:
            None if the server answered with 200 OK,
            otherwise an UploadFailure describing what went wrong
        """
        logger.debug(f"Uploading file '{filepath}'")
        timeout = 9 * 60
        url = self._build_url_for_bulk_ingest_ingest_route(filepath)
        headers = {
            "Content-Type": "application/octet-stream",
            "Authorization": f"Bearer {self.authentication_client.get_token()}",
        }
        err_msg = f"Failed to upload '{filepath}' to '{url}'."
        params = RequestParameters("POST", url, timeout, headers=headers)
        log_request(params)
        try:
            with open(self.imgdir / filepath, "rb") as binary_io:
                res = self.session.post(
                    url=params.url,
                    headers=params.headers,
                    data=binary_io,  # https://requests.readthedocs.io/en/latest/user/advanced/#streaming-uploads
                    timeout=params.timeout,
                )
                log_response(res)
        except RequestException as e:
            logger.exception(err_msg)
            return UploadFailure(filepath, f"Exception of requests library: {e}")
        except OSError as e:
            err_msg = f"Cannot bulk-ingest {filepath}, because the file could not be opened/read: {e.strerror}"
            logger.error(err_msg)
            return UploadFailure(filepath, err_msg)
        if res.status_code != HTTPStatus.OK:
            logger.error(f"{err_msg}: Response {res.status_code}: {res.text}")
            return UploadFailure(filepath, res.reason, res.status_code, res.text)
        return None

    def _build_url_for_bulk_ingest_ingest_route(self, filepath: Path) -> str:
        """
        Remove the leading slash of absolute filepaths,
        because the `/project/<shortcode>/bulk-ingest/ingest` route only accepts relative paths.
        The leading slash has to be added again in the "ingest-xmlupload" step, when applying the ingest ID.

        Args:
            filepath: filepath

        Returns:
            url
        """
        # safe="" also percent-encodes the slashes, so that the whole path becomes one single URL segment
        quoted = regex.sub(r"^%2F", "", urllib.parse.quote(str(filepath), safe=""))
        return f"{self.dsp_ingest_url}/projects/{self.shortcode}/bulk-ingest/ingest/{quoted}"

    def trigger_ingest_process(self) -> None:
        """
        Start the ingest process on the server.

        A 409 response means that the process is already running; this is reported and not treated as an error.

        Raises:
            BadCredentialsError: if the server answers with 403
            InputError: if the server answers with 404, 500 or 503,
                or if the response does not echo the project shortcode in its "id" field
        """
        url = f"{self.dsp_ingest_url}/projects/{self.shortcode}/bulk-ingest"
        timeout = 5
        headers = {"Authorization": f"Bearer {self.authentication_client.get_token()}"}
        params = RequestParameters("POST", url, timeout, headers=headers)
        log_request(params)
        res = self.session.post(params.url, timeout=params.timeout, headers=params.headers)
        log_response(res)
        if res.status_code == HTTPStatus.FORBIDDEN:
            raise BadCredentialsError("Only ProjectAdmins or SystemAdmins can start the ingest process.")
        if res.status_code == HTTPStatus.NOT_FOUND:
            raise InputError(
                f"No assets have been uploaded for project {self.shortcode}. "
                "Before using the 'ingest-files' command, you must upload some files with the 'upload-files' command."
            )
        if res.status_code == HTTPStatus.CONFLICT:
            msg = f"Ingest process on the server {self.dsp_ingest_url} is already running. Wait until it completes..."
            print(msg)
            logger.info(msg)
            return
        if res.status_code in [HTTPStatus.INTERNAL_SERVER_ERROR, HTTPStatus.SERVICE_UNAVAILABLE]:
            raise InputError("Server is unavailable. Please try again later.")

        try:
            payload = res.json()
        except JSONDecodeError:
            payload = None
        # A JSON body that is not an object (e.g. a list or a string) has no "id": treat it as a failure, too
        returned_shortcode = payload.get("id") if isinstance(payload, dict) else None
        if returned_shortcode != self.shortcode:
            raise InputError("Failed to trigger the ingest process. Please check the server logs, or try again later.")
        msg = f"Kicked off the ingest process on the server {self.dsp_ingest_url}. Wait until it completes..."
        print(msg)
        logger.info(msg)

    def retrieve_mapping_generator(self) -> Iterator[str | bool]:
        """
        Try to retrieve the mapping CSV from the server.
        Every iteration of the generator sends one request; waiting between the requests is up to the caller.

        Yields:
            True if the ingest process is still running.
            False if there is a server error.
            The mapping CSV if the ingest process has completed.

        Raises:
            InputError: if there are too many server errors in a row.
        """
        url = f"{self.dsp_ingest_url}/projects/{self.shortcode}/bulk-ingest/mapping.csv"
        timeout = 5
        while True:
            # the token is requested anew for every attempt
            headers = {"Authorization": f"Bearer {self.authentication_client.get_token()}"}
            params = RequestParameters("GET", url, timeout, headers=headers)
            log_request(params)
            res = self.session.get(params.url, timeout=params.timeout, headers=params.headers)
            log_response(res)
            if res.status_code == HTTPStatus.CONFLICT:
                # "still running" is not a failure, so the streak of consecutive failures ends here
                self.retrieval_failures = 0
                logger.info("Ingest process is still running. Wait until it completes...")
                yield True
            elif res.status_code != HTTPStatus.OK or not res.text.startswith("original,derivative"):
                self.retrieval_failures += 1
                if self.retrieval_failures > 15:
                    raise InputError(f"There were too many server errors. Please check the logs at {LOGGER_SAVEPATH}.")
                msg = "While retrieving the mapping CSV, the server responded with an unexpected status code/content."
                logger.error(msg)
                yield False
            else:
                logger.info("Ingest process completed.")
                break
        yield res.content.decode("utf-8")
|
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import cast
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from loguru import logger
|
|
9
|
+
from lxml import etree
|
|
10
|
+
|
|
11
|
+
from dsp_tools.commands.ingest_xmlupload.create_resources.user_information import IngestInformation
|
|
12
|
+
from dsp_tools.error.exceptions import InputError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_mapping_dict_from_file(shortcode: str) -> dict[str, str]:
    """
    Read the mapping CSV of a project,
    which contains the information to replace the original filepaths with the identifiers from dsp-ingest.
    The file is expected in the current working directory, under the name `mapping-<shortcode>.csv`.

    Args:
        shortcode: Shortcode of the project

    Returns:
        dictionary in the form {original filepath: identifier from dsp-ingest},
        built from the columns "original" and "derivative" of the CSV

    Raises:
        InputError: if no file was found
    """
    mapping_csv = Path(f"mapping-{shortcode}.csv")
    if not mapping_csv.is_file():
        raise InputError(f"No mapping CSV file was found at {mapping_csv}.")
    mapping_table = pd.read_csv(mapping_csv)

    # inform the user (on the console and in the log) which file is used
    info = f"The file '{mapping_csv}' is used to map the internal original filepaths to the internal image IDs."
    print(info)
    logger.info(info)

    originals = mapping_table["original"].tolist()
    derivatives = mapping_table["derivative"].tolist()
    return dict(zip(originals, derivatives))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def replace_filepath_with_internal_filename(
    xml_tree: etree._Element,
    orig_path_2_asset_id: dict[str, str],
) -> tuple[etree._Element, IngestInformation]:
    """
    Replace the original filepaths in the `<bitstream>` tags by the id filenames of the uploaded files.
    The tree that is passed in is not modified, because the replacement is done on a deep copy.

    Args:
        xml_tree: The parsed original XML tree
        orig_path_2_asset_id: Mapping from original filenames to asset IDs from the mapping.csv

    Returns:
        A copy of the XML tree, with the replaced filepaths.
        Information about which files of the mapping were not referenced in the XML,
        and which files referenced in the XML have no entry in the mapping.
    """
    no_id_found = []
    # a set, because it is only used for membership tests when the unused files are determined
    used_media_file_paths: set[str] = set()
    new_tree = deepcopy(xml_tree)
    for elem in new_tree.iter():
        if not etree.QName(elem).localname.endswith("bitstream") or not elem.text:
            continue
        img_path_str = elem.text.strip()
        # The mapping may contain the path without its leading slash: fall back to that variant
        if img_path_str not in orig_path_2_asset_id and img_path_str.startswith("/"):
            img_path_str = img_path_str[1:]
        if img_path_str in orig_path_2_asset_id:
            elem.text = orig_path_2_asset_id[img_path_str]
            used_media_file_paths.add(img_path_str)
        else:
            # report the ID of the parent element, together with the text exactly as it stands in the XML
            no_id_found.append((cast("etree._Element", elem.getparent()).attrib["id"], str(elem.text)))

    # iterating over the dict keeps the order of the mapping
    unused_media_paths = [x for x in orig_path_2_asset_id if x not in used_media_file_paths]
    return new_tree, IngestInformation(unused_mediafiles=unused_media_paths, mediafiles_no_id=no_id_found)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from loguru import logger
|
|
6
|
+
from lxml import etree
|
|
7
|
+
|
|
8
|
+
from dsp_tools.cli.args import ServerCredentials
|
|
9
|
+
from dsp_tools.cli.args import ValidateDataConfig
|
|
10
|
+
from dsp_tools.cli.args import ValidationSeverity
|
|
11
|
+
from dsp_tools.clients.authentication_client import AuthenticationClient
|
|
12
|
+
from dsp_tools.clients.authentication_client_live import AuthenticationClientLive
|
|
13
|
+
from dsp_tools.clients.connection import Connection
|
|
14
|
+
from dsp_tools.clients.connection_live import ConnectionLive
|
|
15
|
+
from dsp_tools.clients.legal_info_client_live import LegalInfoClientLive
|
|
16
|
+
from dsp_tools.clients.project_client_live import ProjectClientLive
|
|
17
|
+
from dsp_tools.commands.ingest_xmlupload.create_resources.apply_ingest_id import get_mapping_dict_from_file
|
|
18
|
+
from dsp_tools.commands.ingest_xmlupload.create_resources.apply_ingest_id import replace_filepath_with_internal_filename
|
|
19
|
+
from dsp_tools.commands.validate_data.validate_data import validate_parsed_resources
|
|
20
|
+
from dsp_tools.commands.xmlupload.models.ingest import BulkIngestedAssetClient
|
|
21
|
+
from dsp_tools.commands.xmlupload.models.upload_clients import UploadClients
|
|
22
|
+
from dsp_tools.commands.xmlupload.models.upload_state import UploadState
|
|
23
|
+
from dsp_tools.commands.xmlupload.prepare_xml_input.get_processed_resources import get_processed_resources
|
|
24
|
+
from dsp_tools.commands.xmlupload.prepare_xml_input.list_client import ListClientLive
|
|
25
|
+
from dsp_tools.commands.xmlupload.prepare_xml_input.prepare_xml_input import get_parsed_resources_and_mappers
|
|
26
|
+
from dsp_tools.commands.xmlupload.prepare_xml_input.prepare_xml_input import get_stash_and_upload_order
|
|
27
|
+
from dsp_tools.commands.xmlupload.prepare_xml_input.read_validate_xml_file import validate_iiif_uris
|
|
28
|
+
from dsp_tools.commands.xmlupload.upload_config import UploadConfig
|
|
29
|
+
from dsp_tools.commands.xmlupload.xmlupload import enable_unknown_license_if_any_are_missing
|
|
30
|
+
from dsp_tools.commands.xmlupload.xmlupload import execute_upload
|
|
31
|
+
from dsp_tools.error.exceptions import InputError
|
|
32
|
+
from dsp_tools.utils.ansi_colors import BOLD_RED
|
|
33
|
+
from dsp_tools.utils.ansi_colors import RESET_TO_DEFAULT
|
|
34
|
+
from dsp_tools.utils.data_formats.uri_util import is_prod_like_server
|
|
35
|
+
from dsp_tools.utils.replace_id_with_iri import use_id2iri_mapping_to_replace_ids
|
|
36
|
+
from dsp_tools.utils.xml_parsing.parse_clean_validate_xml import parse_and_clean_xml_file
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def ingest_xmlupload(
    xml_file: Path,
    creds: ServerCredentials,
    interrupt_after: int | None = None,
    skip_validation: bool = False,
    skip_ontology_validation: bool = False,
    id2iri_file: str | None = None,
    do_not_request_resource_metadata_from_db: bool = False,
) -> bool:
    """
    This function reads an XML file
    and imports the data described in it onto the DSP server,
    using the ingest XML upload method.
    Before using this function,
    the multimedia files must be ingested on the DSP server.
    A mapping file with the internal IDs of the multimedia files must also be provided.

    Args:
        xml_file: path to XML file containing the resources
        creds: credentials to access the DSP server
        interrupt_after: if set, the upload will be interrupted after this number of resources
        skip_validation: skip the SHACL validation
            (on a prod-like server, the user is asked on the console to confirm this)
        skip_ontology_validation: skip the ontology validation
        id2iri_file: to replace internal IDs of an XML file by IRIs provided in this mapping file
        do_not_request_resource_metadata_from_db: if true do not request metadata information from the api
            for existing resources

    Returns:
        True if all resources could be uploaded without errors; False if one of the resources could not be
        uploaded because there is an error in it, or if the validation did not pass

    Raises:
        InputError: if any media was not uploaded or uploaded media was not referenced.
    """
    root = parse_and_clean_xml_file(xml_file)
    shortcode = root.attrib["shortcode"]
    root = _replace_filepaths_with_internal_filename_from_ingest(root, shortcode)

    auth = AuthenticationClientLive(server=creds.server, email=creds.user, password=creds.password)
    con = ConnectionLive(creds.server, auth)
    config = UploadConfig(
        media_previously_uploaded=True,
        interrupt_after=interrupt_after,
    ).with_server_info(
        server=creds.server,
        shortcode=shortcode,
    )
    clients = _get_live_clients(con, config, auth)

    parsed_resources, lookups = get_parsed_resources_and_mappers(root, clients)
    if id2iri_file:
        parsed_resources = use_id2iri_mapping_to_replace_ids(parsed_resources, Path(id2iri_file))

    validation_should_be_skipped = skip_validation
    is_on_prod_like_server = is_prod_like_server(creds.server)
    # The flag is checked on the function argument: the UploadConfig above is created without it,
    # so `config.skip_validation` does not reflect what the caller requested.
    if is_on_prod_like_server and skip_validation:
        msg = (
            "You set the flag '--skip-validation' to circumvent the SHACL schema validation. "
            "This means that the upload may fail due to undetected errors. "
            "Do you wish to skip the validation (yes/no)? "
        )
        resp = ""
        while resp not in ["yes", "no"]:  # ask again until the answer is unambiguous
            resp = input(BOLD_RED + msg + RESET_TO_DEFAULT)
        if resp == "no":
            validation_should_be_skipped = False
    if not validation_should_be_skipped:
        v_severity = config.validation_severity
        if is_on_prod_like_server:
            v_severity = ValidationSeverity.INFO
        validation_passed = validate_parsed_resources(
            parsed_resources=parsed_resources,
            authorship_lookup=lookups.authorships,
            permission_ids=list(lookups.permissions.keys()),
            shortcode=shortcode,
            config=ValidateDataConfig(
                xml_file,
                save_graph_dir=None,
                severity=v_severity,
                ignore_duplicate_files_warning=True,
                is_on_prod_server=is_on_prod_like_server,
                skip_ontology_validation=skip_ontology_validation,
                do_not_request_resource_metadata_from_db=do_not_request_resource_metadata_from_db,
            ),
            auth=auth,
        )
        if not validation_passed:
            return False
    else:
        logger.debug("SHACL validation was skipped.")

    if not config.skip_iiif_validation:
        validate_iiif_uris(root)

    if not is_on_prod_like_server:
        enable_unknown_license_if_any_are_missing(clients.legal_info_client, parsed_resources)

    processed_resources = get_processed_resources(parsed_resources, lookups, is_on_prod_like_server)

    sorted_resources, stash = get_stash_and_upload_order(processed_resources)

    state = UploadState(
        pending_resources=sorted_resources,
        pending_stash=stash,
        config=config,
    )

    return execute_upload(clients, state)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _replace_filepaths_with_internal_filename_from_ingest(root: etree._Element, shortcode: str) -> etree._Element:
    """
    Load the mapping CSV of the project and apply it to the file references in the XML tree.

    Args:
        root: parsed XML tree
        shortcode: shortcode of the project, used to locate the mapping CSV

    Returns:
        The tree returned by `replace_filepath_with_internal_filename()`

    Raises:
        InputError: if the ingest information provides no ok-message;
            the error message then comes from its error protocol
    """
    mapping = get_mapping_dict_from_file(shortcode)
    replaced_root, ingest_info = replace_filepath_with_internal_filename(root, mapping)
    ok = ingest_info.ok_msg()
    if not ok:
        # no ok-message: let the ingest information produce the error report
        raise InputError(ingest_info.execute_error_protocol())
    print(ok)
    logger.info(ok)
    return replaced_root
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _get_live_clients(con: Connection, config: UploadConfig, auth: AuthenticationClient) -> UploadClients:
    """
    Instantiate the clients that are needed for the upload.

    Args:
        con: connection, handed to the list client
        config: upload configuration, provides the server and the project shortcode
        auth: authentication client, handed to the project client and the legal info client

    Returns:
        The bundled ingest client, list client and legal info client
    """
    asset_client = BulkIngestedAssetClient()
    # the list client is created for the IRI of the project, which is looked up via the project client
    project_iri = ProjectClientLive(auth.server, auth).get_project_iri(config.shortcode)
    lists = ListClientLive(con, project_iri)
    legal_info = LegalInfoClientLive(config.server, config.shortcode, auth)
    return UploadClients(asset_client, lists, legal_info)
|