dsp-tools 0.9.13__py3-none-any.whl → 18.3.0.post13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsp_tools/__init__.py +5 -0
- dsp_tools/cli/args.py +47 -0
- dsp_tools/cli/call_action.py +85 -0
- dsp_tools/cli/call_action_files_only.py +101 -0
- dsp_tools/cli/call_action_with_network.py +207 -0
- dsp_tools/cli/create_parsers.py +479 -0
- dsp_tools/cli/entry_point.py +322 -0
- dsp_tools/cli/utils.py +87 -0
- dsp_tools/clients/CLAUDE.md +420 -0
- dsp_tools/clients/authentication_client.py +14 -0
- dsp_tools/clients/authentication_client_live.py +66 -0
- dsp_tools/clients/connection.py +35 -0
- dsp_tools/clients/connection_live.py +233 -0
- dsp_tools/clients/fuseki_metrics.py +60 -0
- dsp_tools/clients/group_user_clients.py +35 -0
- dsp_tools/clients/group_user_clients_live.py +181 -0
- dsp_tools/clients/legal_info_client.py +23 -0
- dsp_tools/clients/legal_info_client_live.py +132 -0
- dsp_tools/clients/list_client.py +49 -0
- dsp_tools/clients/list_client_live.py +166 -0
- dsp_tools/clients/metadata_client.py +24 -0
- dsp_tools/clients/metadata_client_live.py +47 -0
- dsp_tools/clients/ontology_clients.py +49 -0
- dsp_tools/clients/ontology_create_client_live.py +166 -0
- dsp_tools/clients/ontology_get_client_live.py +80 -0
- dsp_tools/clients/permissions_client.py +68 -0
- dsp_tools/clients/project_client.py +16 -0
- dsp_tools/clients/project_client_live.py +66 -0
- dsp_tools/commands/create/communicate_problems.py +24 -0
- dsp_tools/commands/create/create.py +134 -0
- dsp_tools/commands/create/create_on_server/cardinalities.py +111 -0
- dsp_tools/commands/create/create_on_server/classes.py +99 -0
- dsp_tools/commands/create/create_on_server/complete_ontologies.py +116 -0
- dsp_tools/commands/create/create_on_server/default_permissions.py +134 -0
- dsp_tools/commands/create/create_on_server/group_users.py +165 -0
- dsp_tools/commands/create/create_on_server/lists.py +163 -0
- dsp_tools/commands/create/create_on_server/mappers.py +12 -0
- dsp_tools/commands/create/create_on_server/onto_utils.py +74 -0
- dsp_tools/commands/create/create_on_server/ontology.py +52 -0
- dsp_tools/commands/create/create_on_server/project.py +68 -0
- dsp_tools/commands/create/create_on_server/properties.py +119 -0
- dsp_tools/commands/create/exceptions.py +29 -0
- dsp_tools/commands/create/lists_only.py +66 -0
- dsp_tools/commands/create/models/create_problems.py +87 -0
- dsp_tools/commands/create/models/parsed_ontology.py +88 -0
- dsp_tools/commands/create/models/parsed_project.py +81 -0
- dsp_tools/commands/create/models/rdf_ontology.py +12 -0
- dsp_tools/commands/create/models/server_project_info.py +100 -0
- dsp_tools/commands/create/parsing/parse_lists.py +45 -0
- dsp_tools/commands/create/parsing/parse_ontology.py +243 -0
- dsp_tools/commands/create/parsing/parse_project.py +149 -0
- dsp_tools/commands/create/parsing/parsing_utils.py +40 -0
- dsp_tools/commands/create/project_validate.py +595 -0
- dsp_tools/commands/create/serialisation/ontology.py +119 -0
- dsp_tools/commands/create/serialisation/project.py +44 -0
- dsp_tools/commands/excel2json/CLAUDE.md +101 -0
- dsp_tools/commands/excel2json/json_header.py +321 -0
- dsp_tools/commands/excel2json/lists/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/compliance_checks.py +292 -0
- dsp_tools/commands/excel2json/lists/make_lists.py +247 -0
- dsp_tools/commands/excel2json/lists/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/models/deserialise.py +30 -0
- dsp_tools/commands/excel2json/lists/models/input_error.py +216 -0
- dsp_tools/commands/excel2json/lists/models/serialise.py +57 -0
- dsp_tools/commands/excel2json/lists/utils.py +81 -0
- dsp_tools/commands/excel2json/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/models/input_error.py +416 -0
- dsp_tools/commands/excel2json/models/json_header.py +175 -0
- dsp_tools/commands/excel2json/models/list_node_name.py +16 -0
- dsp_tools/commands/excel2json/models/ontology.py +76 -0
- dsp_tools/commands/excel2json/old_lists.py +328 -0
- dsp_tools/commands/excel2json/project.py +280 -0
- dsp_tools/commands/excel2json/properties.py +370 -0
- dsp_tools/commands/excel2json/resources.py +336 -0
- dsp_tools/commands/excel2json/utils.py +352 -0
- dsp_tools/commands/excel2xml/__init__.py +7 -0
- dsp_tools/commands/excel2xml/excel2xml_cli.py +523 -0
- dsp_tools/commands/excel2xml/excel2xml_lib.py +1953 -0
- dsp_tools/commands/excel2xml/propertyelement.py +47 -0
- dsp_tools/commands/get/__init__.py +0 -0
- dsp_tools/commands/get/get.py +166 -0
- dsp_tools/commands/get/get_permissions.py +257 -0
- dsp_tools/commands/get/get_permissions_legacy.py +89 -0
- dsp_tools/commands/get/legacy_models/__init__.py +0 -0
- dsp_tools/commands/get/legacy_models/context.py +318 -0
- dsp_tools/commands/get/legacy_models/group.py +241 -0
- dsp_tools/commands/get/legacy_models/helpers.py +47 -0
- dsp_tools/commands/get/legacy_models/listnode.py +390 -0
- dsp_tools/commands/get/legacy_models/model.py +12 -0
- dsp_tools/commands/get/legacy_models/ontology.py +324 -0
- dsp_tools/commands/get/legacy_models/project.py +366 -0
- dsp_tools/commands/get/legacy_models/propertyclass.py +417 -0
- dsp_tools/commands/get/legacy_models/resourceclass.py +676 -0
- dsp_tools/commands/get/legacy_models/user.py +438 -0
- dsp_tools/commands/get/models/__init__.py +0 -0
- dsp_tools/commands/get/models/permissions_models.py +10 -0
- dsp_tools/commands/id2iri.py +258 -0
- dsp_tools/commands/ingest_xmlupload/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/bulk_ingest_client.py +178 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/apply_ingest_id.py +69 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/upload_xml.py +166 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/user_information.py +121 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/ingest_files.py +64 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/filechecker.py +20 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +57 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_failures.py +66 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_files.py +67 -0
- dsp_tools/commands/resume_xmlupload/__init__.py +0 -0
- dsp_tools/commands/resume_xmlupload/resume_xmlupload.py +96 -0
- dsp_tools/commands/start_stack.py +428 -0
- dsp_tools/commands/update_legal/CLAUDE.md +344 -0
- dsp_tools/commands/update_legal/__init__.py +0 -0
- dsp_tools/commands/update_legal/core.py +182 -0
- dsp_tools/commands/update_legal/csv_operations.py +135 -0
- dsp_tools/commands/update_legal/models.py +87 -0
- dsp_tools/commands/update_legal/xml_operations.py +247 -0
- dsp_tools/commands/validate_data/CLAUDE.md +159 -0
- dsp_tools/commands/validate_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/constants.py +59 -0
- dsp_tools/commands/validate_data/mappers.py +143 -0
- dsp_tools/commands/validate_data/models/__init__.py +0 -0
- dsp_tools/commands/validate_data/models/api_responses.py +45 -0
- dsp_tools/commands/validate_data/models/input_problems.py +119 -0
- dsp_tools/commands/validate_data/models/rdf_like_data.py +117 -0
- dsp_tools/commands/validate_data/models/validation.py +106 -0
- dsp_tools/commands/validate_data/prepare_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/prepare_data/get_rdf_like_data.py +296 -0
- dsp_tools/commands/validate_data/prepare_data/make_data_graph.py +91 -0
- dsp_tools/commands/validate_data/prepare_data/prepare_data.py +184 -0
- dsp_tools/commands/validate_data/process_validation_report/__init__.py +0 -0
- dsp_tools/commands/validate_data/process_validation_report/get_user_validation_message.py +358 -0
- dsp_tools/commands/validate_data/process_validation_report/query_validation_result.py +507 -0
- dsp_tools/commands/validate_data/process_validation_report/reformat_validation_results.py +150 -0
- dsp_tools/commands/validate_data/shacl_cli_validator.py +70 -0
- dsp_tools/commands/validate_data/sparql/__init__.py +0 -0
- dsp_tools/commands/validate_data/sparql/cardinality_shacl.py +209 -0
- dsp_tools/commands/validate_data/sparql/construct_shacl.py +92 -0
- dsp_tools/commands/validate_data/sparql/legal_info_shacl.py +36 -0
- dsp_tools/commands/validate_data/sparql/value_shacl.py +357 -0
- dsp_tools/commands/validate_data/utils.py +59 -0
- dsp_tools/commands/validate_data/validate_data.py +283 -0
- dsp_tools/commands/validate_data/validation/__init__.py +0 -0
- dsp_tools/commands/validate_data/validation/check_duplicate_files.py +55 -0
- dsp_tools/commands/validate_data/validation/check_for_unknown_classes.py +67 -0
- dsp_tools/commands/validate_data/validation/get_validation_report.py +94 -0
- dsp_tools/commands/validate_data/validation/validate_ontology.py +107 -0
- dsp_tools/commands/xmlupload/CLAUDE.md +292 -0
- dsp_tools/commands/xmlupload/__init__.py +0 -0
- dsp_tools/commands/xmlupload/iri_resolver.py +21 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/__init__.py +0 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/constants.py +63 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/jsonld_utils.py +44 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_file_value.py +77 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_resource_and_values.py +114 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_values.py +262 -0
- dsp_tools/commands/xmlupload/models/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/bitstream_info.py +18 -0
- dsp_tools/commands/xmlupload/models/formatted_text_value.py +10 -0
- dsp_tools/commands/xmlupload/models/ingest.py +143 -0
- dsp_tools/commands/xmlupload/models/input_problems.py +58 -0
- dsp_tools/commands/xmlupload/models/lookup_models.py +21 -0
- dsp_tools/commands/xmlupload/models/permission.py +45 -0
- dsp_tools/commands/xmlupload/models/permissions_parsed.py +93 -0
- dsp_tools/commands/xmlupload/models/processed/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/processed/file_values.py +29 -0
- dsp_tools/commands/xmlupload/models/processed/res.py +27 -0
- dsp_tools/commands/xmlupload/models/processed/values.py +101 -0
- dsp_tools/commands/xmlupload/models/rdf_models.py +26 -0
- dsp_tools/commands/xmlupload/models/upload_clients.py +14 -0
- dsp_tools/commands/xmlupload/models/upload_state.py +20 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/__init__.py +0 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/ark2iri.py +55 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/get_processed_resources.py +252 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/iiif_uri_validator.py +50 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/list_client.py +120 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/prepare_xml_input.py +67 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/read_validate_xml_file.py +58 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/transform_input_values.py +118 -0
- dsp_tools/commands/xmlupload/resource_create_client.py +25 -0
- dsp_tools/commands/xmlupload/richtext_id2iri.py +37 -0
- dsp_tools/commands/xmlupload/stash/__init__.py +0 -0
- dsp_tools/commands/xmlupload/stash/analyse_circular_reference_graph.py +236 -0
- dsp_tools/commands/xmlupload/stash/create_info_for_graph.py +53 -0
- dsp_tools/commands/xmlupload/stash/graph_models.py +87 -0
- dsp_tools/commands/xmlupload/stash/stash_circular_references.py +68 -0
- dsp_tools/commands/xmlupload/stash/stash_models.py +109 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_resptr_props.py +106 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_xml_texts.py +196 -0
- dsp_tools/commands/xmlupload/upload_config.py +76 -0
- dsp_tools/commands/xmlupload/write_diagnostic_info.py +27 -0
- dsp_tools/commands/xmlupload/xmlupload.py +516 -0
- dsp_tools/config/__init__.py +0 -0
- dsp_tools/config/logger_config.py +69 -0
- dsp_tools/config/warnings_config.py +32 -0
- dsp_tools/error/__init__.py +0 -0
- dsp_tools/error/custom_warnings.py +39 -0
- dsp_tools/error/exceptions.py +204 -0
- dsp_tools/error/problems.py +10 -0
- dsp_tools/error/xmllib_errors.py +20 -0
- dsp_tools/error/xmllib_warnings.py +54 -0
- dsp_tools/error/xmllib_warnings_util.py +159 -0
- dsp_tools/error/xsd_validation_error_msg.py +19 -0
- dsp_tools/legacy_models/__init__.py +0 -0
- dsp_tools/legacy_models/datetimestamp.py +81 -0
- dsp_tools/legacy_models/langstring.py +253 -0
- dsp_tools/legacy_models/projectContext.py +49 -0
- dsp_tools/py.typed +0 -0
- dsp_tools/resources/schema/data.xsd +648 -0
- dsp_tools/resources/schema/lists-only.json +72 -0
- dsp_tools/resources/schema/project.json +1258 -0
- dsp_tools/resources/schema/properties-only.json +874 -0
- dsp_tools/resources/schema/resources-only.json +140 -0
- dsp_tools/resources/start-stack/docker-compose.override-host.j2 +11 -0
- dsp_tools/resources/start-stack/docker-compose.override.yml +11 -0
- dsp_tools/resources/start-stack/docker-compose.yml +88 -0
- dsp_tools/resources/start-stack/dsp-app-config.json +45 -0
- dsp_tools/resources/start-stack/dsp-app-config.override-host.j2 +26 -0
- dsp_tools/resources/validate_data/api-shapes-resource-cardinalities.ttl +191 -0
- dsp_tools/resources/validate_data/api-shapes.ttl +804 -0
- dsp_tools/resources/validate_data/shacl-cli-image.yml +4 -0
- dsp_tools/resources/validate_data/validate-ontology.ttl +99 -0
- dsp_tools/utils/__init__.py +0 -0
- dsp_tools/utils/ansi_colors.py +32 -0
- dsp_tools/utils/data_formats/__init__.py +0 -0
- dsp_tools/utils/data_formats/date_util.py +166 -0
- dsp_tools/utils/data_formats/iri_util.py +30 -0
- dsp_tools/utils/data_formats/shared.py +81 -0
- dsp_tools/utils/data_formats/uri_util.py +76 -0
- dsp_tools/utils/fuseki_bloating.py +63 -0
- dsp_tools/utils/json_parsing.py +22 -0
- dsp_tools/utils/rdf_constants.py +42 -0
- dsp_tools/utils/rdflib_utils.py +10 -0
- dsp_tools/utils/replace_id_with_iri.py +66 -0
- dsp_tools/utils/request_utils.py +238 -0
- dsp_tools/utils/xml_parsing/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/get_lookups.py +32 -0
- dsp_tools/utils/xml_parsing/get_parsed_resources.py +325 -0
- dsp_tools/utils/xml_parsing/models/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/models/parsed_resource.py +76 -0
- dsp_tools/utils/xml_parsing/parse_clean_validate_xml.py +137 -0
- dsp_tools/xmllib/CLAUDE.md +302 -0
- dsp_tools/xmllib/__init__.py +49 -0
- dsp_tools/xmllib/general_functions.py +877 -0
- dsp_tools/xmllib/internal/__init__.py +0 -0
- dsp_tools/xmllib/internal/checkers.py +162 -0
- dsp_tools/xmllib/internal/circumvent_circular_imports.py +36 -0
- dsp_tools/xmllib/internal/constants.py +46 -0
- dsp_tools/xmllib/internal/input_converters.py +155 -0
- dsp_tools/xmllib/internal/serialise_file_value.py +57 -0
- dsp_tools/xmllib/internal/serialise_resource.py +177 -0
- dsp_tools/xmllib/internal/serialise_values.py +152 -0
- dsp_tools/xmllib/internal/type_aliases.py +11 -0
- dsp_tools/xmllib/models/__init__.py +0 -0
- dsp_tools/xmllib/models/config_options.py +28 -0
- dsp_tools/xmllib/models/date_formats.py +48 -0
- dsp_tools/xmllib/models/dsp_base_resources.py +1542 -0
- dsp_tools/xmllib/models/internal/__init__.py +0 -0
- dsp_tools/xmllib/models/internal/file_values.py +172 -0
- dsp_tools/xmllib/models/internal/geometry.py +162 -0
- dsp_tools/xmllib/models/internal/migration_metadata.py +55 -0
- dsp_tools/xmllib/models/internal/serialise_permissions.py +66 -0
- dsp_tools/xmllib/models/internal/values.py +342 -0
- dsp_tools/xmllib/models/licenses/__init__.py +0 -0
- dsp_tools/xmllib/models/licenses/other.py +59 -0
- dsp_tools/xmllib/models/licenses/recommended.py +107 -0
- dsp_tools/xmllib/models/permissions.py +41 -0
- dsp_tools/xmllib/models/res.py +1782 -0
- dsp_tools/xmllib/models/root.py +348 -0
- dsp_tools/xmllib/value_checkers.py +434 -0
- dsp_tools/xmllib/value_converters.py +777 -0
- dsp_tools-18.3.0.post13.dist-info/METADATA +90 -0
- dsp_tools-18.3.0.post13.dist-info/RECORD +286 -0
- dsp_tools-18.3.0.post13.dist-info/WHEEL +4 -0
- dsp_tools-18.3.0.post13.dist-info/entry_points.txt +3 -0
- dsp_tools-0.9.13.dist-info/LICENSE +0 -674
- dsp_tools-0.9.13.dist-info/METADATA +0 -144
- dsp_tools-0.9.13.dist-info/RECORD +0 -71
- dsp_tools-0.9.13.dist-info/WHEEL +0 -5
- dsp_tools-0.9.13.dist-info/entry_points.txt +0 -3
- dsp_tools-0.9.13.dist-info/top_level.txt +0 -1
- dsplib/models/connection.py +0 -272
- dsplib/models/group.py +0 -296
- dsplib/models/helpers.py +0 -505
- dsplib/models/langstring.py +0 -277
- dsplib/models/listnode.py +0 -578
- dsplib/models/model.py +0 -20
- dsplib/models/ontology.py +0 -448
- dsplib/models/permission.py +0 -112
- dsplib/models/project.py +0 -547
- dsplib/models/propertyclass.py +0 -505
- dsplib/models/resource.py +0 -366
- dsplib/models/resourceclass.py +0 -810
- dsplib/models/sipi.py +0 -30
- dsplib/models/user.py +0 -731
- dsplib/models/value.py +0 -1000
- dsplib/utils/knora-data-schema.xsd +0 -454
- dsplib/utils/knora-schema-lists.json +0 -83
- dsplib/utils/knora-schema.json +0 -434
- dsplib/utils/onto_commons.py +0 -24
- dsplib/utils/onto_create_lists.py +0 -73
- dsplib/utils/onto_create_ontology.py +0 -442
- dsplib/utils/onto_get.py +0 -58
- dsplib/utils/onto_validate.py +0 -33
- dsplib/utils/xml_upload.py +0 -539
- dsplib/widgets/doublepassword.py +0 -80
- knora/MLS-import-libraries.py +0 -84
- knora/dsp_tools.py +0 -96
- knora/dsplib/models/connection.py +0 -272
- knora/dsplib/models/group.py +0 -296
- knora/dsplib/models/helpers.py +0 -506
- knora/dsplib/models/langstring.py +0 -277
- knora/dsplib/models/listnode.py +0 -578
- knora/dsplib/models/model.py +0 -20
- knora/dsplib/models/ontology.py +0 -448
- knora/dsplib/models/permission.py +0 -112
- knora/dsplib/models/project.py +0 -583
- knora/dsplib/models/propertyclass.py +0 -505
- knora/dsplib/models/resource.py +0 -416
- knora/dsplib/models/resourceclass.py +0 -811
- knora/dsplib/models/sipi.py +0 -35
- knora/dsplib/models/user.py +0 -731
- knora/dsplib/models/value.py +0 -1000
- knora/dsplib/utils/knora-data-schema.xsd +0 -464
- knora/dsplib/utils/knora-schema-lists.json +0 -83
- knora/dsplib/utils/knora-schema.json +0 -444
- knora/dsplib/utils/onto_commons.py +0 -24
- knora/dsplib/utils/onto_create_lists.py +0 -73
- knora/dsplib/utils/onto_create_ontology.py +0 -451
- knora/dsplib/utils/onto_get.py +0 -58
- knora/dsplib/utils/onto_validate.py +0 -33
- knora/dsplib/utils/xml_upload.py +0 -540
- knora/dsplib/widgets/doublepassword.py +0 -80
- knora/knora.py +0 -2108
- knora/test.py +0 -99
- knora/testit.py +0 -76
- knora/xml2knora.py +0 -633
- {dsplib → dsp_tools/cli}/__init__.py +0 -0
- {dsplib/models → dsp_tools/clients}/__init__.py +0 -0
- {dsplib/utils → dsp_tools/commands}/__init__.py +0 -0
- {dsplib/widgets → dsp_tools/commands/create}/__init__.py +0 -0
- {knora → dsp_tools/commands/create/create_on_server}/__init__.py +0 -0
- {knora/dsplib → dsp_tools/commands/create/models}/__init__.py +0 -0
- {knora/dsplib/models → dsp_tools/commands/create/parsing}/__init__.py +0 -0
- {knora/dsplib/utils → dsp_tools/commands/create/serialisation}/__init__.py +0 -0
- {knora/dsplib/widgets → dsp_tools/commands/excel2json}/__init__.py +0 -0
# Update Legal Metadata Command

## Purpose

The `update-legal` command converts legal metadata in XML files from the old format (text properties)
to the new format (bitstream attributes). This migration is necessary because:

- **Old format**: Legal metadata (authorship, copyright, license) stored as `<text-prop>` elements within resources
- **New format**: Legal metadata stored as attributes directly on `<bitstream>` or `<iiif-uri>` elements

This command automates the migration while handling validation, error correction, and authorship deduplication.

## Command Usage

```bash
dsp-tools update-legal \
    --authorship_prop=":hasAuthor" \
    --copyright_prop=":hasCopyright" \
    --license_prop=":hasLicense" \
    --authorship_default="Project Member" \
    --copyright_default="University" \
    --license_default="CC BY" \
    --fixed_errors="data_legal_errors.csv" \
    data.xml
```

See [docs/special-workflows/update-legal.md](../../../../docs/special-workflows/update-legal.md) for user documentation.

## Architecture Overview

### Data Flow Pipeline

The command follows a multi-stage pipeline:

1. **Parse & Validate**: Parse XML file and validate property names exist
2. **Extract Metadata**: For each resource with media, extract legal metadata using priority system
3. **Validation**: Check for missing/invalid values
4. **Error Handling**: If errors exist, write CSV for manual correction
5. **Update XML**: If no errors, apply metadata as attributes and write updated XML
6. **Iteration**: User fixes CSV and reruns command until all errors resolved

### Priority System

Metadata values are resolved using this priority order:

1. **CSV corrections** (from `--fixed_errors` file)
2. **XML properties** (extracted from text-prop elements)
3. **Default values** (from `--*_default` flags)
4. **None** (triggers validation error)

## Module Responsibilities

### [core.py](core.py)

Main orchestration and validation logic:

- `update_legal_metadata()`: Entry point that coordinates entire workflow
- `_validate_flags()`: Ensures property names exist in XML
- `_update_xml_tree()`: Iterates through resources, collects metadata once per resource, decides whether to apply changes
- `_has_problems()`: Checks if metadata contains FIXME markers or missing values
- `_update_counter()`: Tracks statistics for final report

Key patterns:

- Uses functional approach with pure helper functions
- Clear separation: collection (read-only) vs application (mutations)
- Single-pass metadata collection eliminates duplicate work
- Authorship deduplication via `auth_text_to_id` dictionary (maps authorship text to unique ID)

### [models.py](models.py)

Data structures and configuration:

- `LegalProperties`: Configuration for XML property names (e.g., `:hasAuthor`)
- `LegalMetadata`: Represents legal metadata for a single resource (license, copyright, authorships)
- `LegalMetadataDefaults`: Default values with automatic license parsing
- `Problem`: Represents validation error for CSV export
- `UpdateCounter`: Statistics tracker for final report

Important notes:

- `LegalMetadataDefaults.__init__()` automatically validates and parses license strings
  using `xmllib.find_license_in_string()`
- All dataclasses use frozen=True for immutability where appropriate

### [csv_operations.py](csv_operations.py)

CSV I/O for error handling workflow:

- `ProblemAggregator`: Converts problems to DataFrame with dynamic authorship columns
- `read_corrections_csv()`: Parses CSV corrections into `dict[resource_id, LegalMetadata]`
- `write_problems_to_csv()`: Writes validation errors to CSV with helpful FIXME markers
- `is_fixme_value()`: Checks if value starts with "FIXME:" prefix

**CSV format:**

- Fixed columns: `file`, `resource_id`, `license`, `copyright`
- Dynamic columns: `authorship_1`, `authorship_2`, ... (as many as needed)
- Sorted by resource_id for easy navigation

**Error prevention:**

- Refuses to overwrite existing CSV unless `--fixed_errors` flag provided
- Helpful error message suggests correct flag usage

### [xml_operations.py](xml_operations.py)

XML manipulation and metadata application:

- `collect_metadata()`: Pure function that collects metadata from CSV, XML, or defaults (read-only)
- `apply_metadata_to_resource()`: Applies metadata as attributes and removes old text properties (mutations)
- `_resolve_metadata_values()`: Implements priority system (CSV > XML > defaults)
- `_extract_license_from_xml()`: Extracts license and validates with `xmllib.find_license_in_string()`
- `_extract_copyright_from_xml()`: Extracts copyright, detects duplicates
- `_extract_authorships_from_xml()`: Collects all authorship values
- `_apply_metadata_to_element()`: Applies metadata as attributes on media element
- `_remove_text_properties()`: Removes old text-prop elements
- `add_authorship_definitions_to_xml()`: Creates `<authorship>` definitions at root level
- `write_final_xml()`: Writes updated XML with statistics

**Authorship deduplication:**

- Multiple resources can share same authorship (e.g., "Jane Doe, Alice Jenkins")
- `auth_text_to_id` dictionary tracks unique authorships and assigns sequential IDs
- Authorship definitions added to root as `<authorship id="authorship_0">` elements
- Media elements reference via `authorship-id="authorship_0"` attribute

**Multiple value detection:**

- If multiple copyright values found: returns `"FIXME: Multiple copyrights found. Choose one: ..."`
- If multiple license values found: returns `"FIXME: Multiple licenses found. Choose one: ..."`
- This triggers CSV export for manual resolution

## Architectural Improvements

### Separation of Collection and Application

The codebase follows a clear pattern separating read operations from write operations:

**Collection Phase (`collect_metadata()`):**

- Pure function with no side effects
- Reads from CSV, XML properties, and defaults
- Returns `LegalMetadata` object
- Can be called safely without modifying the XML tree
- Executes exactly once per resource

**Application Phase (`apply_metadata_to_resource()`):**

- Mutates the XML tree in-place
- Applies metadata as attributes on media elements
- Removes old text properties
- Manages authorship deduplication dictionary
- Only called for valid resources (no problems)

**Benefits:**

- **Performance**: Eliminates duplicate XPath queries (~50% reduction for valid resources)
- **Clarity**: Clear contract - collection is read-only, application mutates
- **Safety**: Impossible to accidentally mutate during problem detection
- **Testability**: Each phase can be tested independently

## Key Algorithms

### Authorship Deduplication

Problem: Multiple resources may share the same authorship (e.g., "Jane Doe, Alice Jenkins").

Solution:

1. Maintain `auth_text_to_id: dict[str, int]` throughout tree traversal
2. When applying authorship to media element:
   - Join all authorship values with ", " separator
   - Check if this text already has an ID
   - If not, assign next sequential ID
   - Add `authorship-id="authorship_{id}"` attribute to media element
3. After tree traversal, create `<authorship>` definitions at root level
4. Each definition contains `<author>` child elements

Result: Shared authorships stored once at root, referenced by multiple resources.

### FIXME Value Detection

Problem: Need to distinguish between missing values and values that need manual correction.

Solution:

- Use "FIXME:" prefix for values that need correction
- `is_fixme_value()` checks for this prefix
- During extraction:
  - Multiple values: `"FIXME: Multiple X found. Choose one: A, B"`
  - Invalid license: `"FIXME: Invalid license: courtesy of museum"`
- During validation: FIXME values treated same as missing values
- During CSV reading: FIXME values converted back to None

Result: Clear distinction between "missing" and "needs correction" in CSV workflow.

### License Parsing

Problem: License strings come in many formats (`CC BY`, `CC-BY-4.0`, `http://rdfh.ch/licenses/cc-by-4.0`).

Solution:

- Use `xmllib.find_license_in_string()` to parse license text into standardized License enum
- If parsing fails, return `"FIXME: Invalid license: {text}"`
- In defaults, parse license string during `__init__()` to fail fast if invalid

Result: All licenses normalized to standard IRIs.

## Error Handling Strategy

### Iterative CSV Correction Workflow

1. **First run**: User provides property names and defaults
   - Command extracts metadata using priority system
   - Validates all values
   - If errors found: writes CSV with FIXME markers
   - No XML output created

2. **Manual correction**: User fixes CSV
   - Replaces FIXME markers with correct values
   - Can add missing values
   - Can choose between multiple values
   - **Important**: Can modify ANY column (not just FIXME ones) - see "CSV Override Behavior" below

3. **Second run**: User provides `--fixed_errors` flag
   - Command loads corrections from CSV
   - CSV corrections take highest priority
   - Validates again
   - If still errors: writes new CSV
   - If no errors: writes updated XML

4. **Repeat** until all errors resolved

### Validation Rules

A resource has problems if:

- License is None or FIXME value
- Copyright is None or FIXME value
- Authorships is empty list or contains FIXME value

Important: A resource must have ALL THREE components valid to avoid CSV export.

### CSV Override Behavior

**Critical implementation detail:**

When `--fixed_errors` is used, ALL non-None CSV values override XML properties and defaults for resources in that CSV.
This applies to every column, not just FIXME markers.

**Priority resolution in `_resolve_metadata_values()`:**

1. If CSV has non-None value: use it (skip XML extraction and defaults)
2. Else if XML has value: use it (skip defaults)
3. Else if defaults provided: use them
4. Else: None (triggers validation error)

**Note:** FIXME-prefixed values are converted to None during CSV reading, allowing fallback to XML/defaults.

### Error Messages

User-facing error messages:

- Missing property: Caught early in `_validate_flags()` with clear message
- Existing CSV: Suggests using `--fixed_errors` flag
- Invalid license default: Raised during `LegalMetadataDefaults.__init__()`
- FIXME markers in CSV: Provide context about what needs fixing

## Testing Considerations
|
|
271
|
+
|
|
272
|
+
### Unit Testing
|
|
273
|
+
|
|
274
|
+
Test each function in isolation:
|
|
275
|
+
|
|
276
|
+
- **Extraction functions**: Test with various XML structures (missing, single, multiple values)
|
|
277
|
+
- **Priority resolution**: Test all combinations of CSV/XML/defaults
|
|
278
|
+
- **FIXME detection**: Test all FIXME marker formats
|
|
279
|
+
- **CSV operations**: Test round-trip (write problems → read corrections)
|
|
280
|
+
- **Authorship deduplication**: Test ID assignment and reuse
|
|
281
|
+
|
|
282
|
+
### Integration Testing
|
|
283
|
+
|
|
284
|
+
Test file I/O and cross-module interactions:
|
|
285
|
+
|
|
286
|
+
- **Full workflow**: Input XML → CSV → corrected CSV → output XML
|
|
287
|
+
- **Property validation**: Missing properties raise correct error
|
|
288
|
+
- **Default values**: Applied when XML values missing
|
|
289
|
+
- **CSV overwrite protection**: Existing CSV prevents accidental overwrite
|
|
290
|
+
|
|
291
|
+
### E2E Testing
|
|
292
|
+
|
|
293
|
+
Test realistic scenarios:
|
|
294
|
+
|
|
295
|
+
- **Simple case**: All metadata present and valid
|
|
296
|
+
- **Missing values**: Some resources missing authorship/copyright/license
|
|
297
|
+
- **Invalid licenses**: Test "courtesy" and other invalid formats
|
|
298
|
+
- **Multiple values**: Resources with multiple copyright/license values
|
|
299
|
+
- **Shared authorships**: Multiple resources with same authorship
|
|
300
|
+
- **Iterative correction**: Multiple runs with CSV corrections
|
|
301
|
+
|
|
302
|
+
### Edge Cases
|
|
303
|
+
|
|
304
|
+
- Empty authorship text values (should be filtered out)
|
|
305
|
+
- Whitespace-only values (should be treated as empty)
|
|
306
|
+
- Resources without media elements (should be skipped)
|
|
307
|
+
- Both bitstream and iiif-uri present (first one used)
|
|
308
|
+
- Unicode in authorship names
|
|
309
|
+
- Very long authorship lists
|
|
310
|
+
|
|
311
|
+
## Common Pitfalls
|
|
312
|
+
|
|
313
|
+
1. **Forgetting `--fixed_errors` flag**: Command will refuse to overwrite existing CSV
|
|
314
|
+
2. **Not providing any property flags**: Caught early with validation error
|
|
315
|
+
3. **Property names with wrong namespace**: Caught early when no matches found
|
|
316
|
+
4. **Leaving FIXME markers in CSV**: Treated as missing values, triggers new CSV
|
|
317
|
+
5. **Invalid license default**: Fails during defaults initialization, not during execution
|
|
318
|
+
|
|
319
|
+
## Performance Considerations
|
|
320
|
+
|
|
321
|
+
- **Single-pass tree traversal**: All resources processed in one iteration
|
|
322
|
+
- **In-memory CSV**: Entire corrections CSV loaded into memory as dictionary
|
|
323
|
+
- **lxml XPath**: Efficient XPath queries for property extraction
|
|
324
|
+
- **Authorship deduplication**: O(1) lookup via dictionary
|
|
325
|
+
- **CSV sorting**: Results sorted by resource_id for easier navigation
|
|
326
|
+
|
|
327
|
+
For typical XML files (thousands of resources), performance should be near-instantaneous.
|
|
328
|
+
|
|
329
|
+
## Dependencies
|
|
330
|
+
|
|
331
|
+
- **lxml**: XML parsing and manipulation
|
|
332
|
+
- **pandas**: CSV I/O with proper column handling
|
|
333
|
+
- **xmllib**: License parsing utilities (`find_license_in_string()`)
|
|
334
|
+
- **dsp_tools.utils.xml_parsing**: XML parsing/validation utilities
|
|
335
|
+
|
|
336
|
+
## Future Improvements
|
|
337
|
+
|
|
338
|
+
Possible enhancements:
|
|
339
|
+
|
|
340
|
+
- Batch processing: Process multiple XML files at once
|
|
341
|
+
- Auto-detection: Try to guess property names from XML structure
|
|
342
|
+
- Validation preview: Show what would be changed without modifying XML
|
|
343
|
+
- Undo functionality: Revert XML back to text properties
|
|
344
|
+
- License suggestions: Use fuzzy matching for invalid licenses
|
|
File without changes
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from lxml import etree
|
|
4
|
+
|
|
5
|
+
from dsp_tools.commands.update_legal.csv_operations import is_fixme_value
|
|
6
|
+
from dsp_tools.commands.update_legal.csv_operations import read_corrections_csv
|
|
7
|
+
from dsp_tools.commands.update_legal.csv_operations import write_problems_to_csv
|
|
8
|
+
from dsp_tools.commands.update_legal.models import Authorships
|
|
9
|
+
from dsp_tools.commands.update_legal.models import LegalMetadata
|
|
10
|
+
from dsp_tools.commands.update_legal.models import LegalMetadataDefaults
|
|
11
|
+
from dsp_tools.commands.update_legal.models import LegalProperties
|
|
12
|
+
from dsp_tools.commands.update_legal.models import Problem
|
|
13
|
+
from dsp_tools.commands.update_legal.models import UpdateCounter
|
|
14
|
+
from dsp_tools.commands.update_legal.xml_operations import add_authorship_definitions_to_xml
|
|
15
|
+
from dsp_tools.commands.update_legal.xml_operations import apply_metadata_to_resource
|
|
16
|
+
from dsp_tools.commands.update_legal.xml_operations import collect_metadata
|
|
17
|
+
from dsp_tools.commands.update_legal.xml_operations import write_updated_xml
|
|
18
|
+
from dsp_tools.error.exceptions import InputError
|
|
19
|
+
from dsp_tools.utils.xml_parsing.parse_clean_validate_xml import parse_xml_file
|
|
20
|
+
from dsp_tools.utils.xml_parsing.parse_clean_validate_xml import transform_into_localnames
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def update_legal_metadata(
    input_file: Path,
    properties: LegalProperties,
    defaults: LegalMetadataDefaults,
    fixed_errors_file: Path | None = None,
    treat_invalid_licenses_as_unknown: bool = False,
) -> bool:
    """
    Update legal metadata in an XML file, converting text properties to bitstream attributes.

    Args:
        input_file: Path to the input XML file
        properties: Configuration for property names to extract from XML
        defaults: Default values to use when metadata is missing
        fixed_errors_file: Path to CSV file with corrected values
        treat_invalid_licenses_as_unknown: If True, invalid licenses are replaced with 'unknown'

    Returns:
        True if all legal metadata could be updated, False if CSV error file was created
    """
    # Corrections (if any) take highest priority during metadata resolution.
    csv_corrections = read_corrections_csv(fixed_errors_file) if fixed_errors_file else None

    tree = transform_into_localnames(parse_xml_file(input_file))
    _validate_flags(tree, properties)

    updated_root, counter, problems = _update_xml_tree(
        root=tree,
        properties=properties,
        defaults=defaults,
        csv_corrections=csv_corrections,
        treat_invalid_licenses_as_unknown=treat_invalid_licenses_as_unknown,
    )

    if not problems:
        # Success - every resource was fully updated, write final XML with _updated suffix.
        write_updated_xml(input_file, updated_root, counter, partial=False)
        return True

    # Partial update - report the remaining problems as CSV and keep a partial XML.
    write_problems_to_csv(input_file, problems)
    write_updated_xml(input_file, updated_root, counter, partial=True)
    return False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _validate_flags(root: etree._Element, properties: LegalProperties) -> None:
    """
    Validate the user-provided property flags against the XML file.

    Ensures that at least one legal property was given, and that every given
    property name actually occurs as a <text-prop> in the XML document.

    Args:
        root: root element of the parsed XML document (tags in localnames)
        properties: user-provided property names to extract from the XML

    Raises:
        InputError: if no property was provided, or if a provided name does not exist in the file
    """
    if not properties.has_any_property():
        raise InputError("At least one property (authorship_prop, copyright_prop, license_prop) must be provided")
    # set() directly instead of a redundant set comprehension; O(1) membership tests below
    text_prop_names = set(root.xpath("//text-prop/@name"))
    inexisting_props = [
        x
        for x in [properties.authorship_prop, properties.copyright_prop, properties.license_prop]
        if x and x not in text_prop_names
    ]
    if inexisting_props:
        raise InputError(f"The following properties do not exist in the XML file: {', '.join(inexisting_props)}")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _update_xml_tree(
    root: etree._Element,
    properties: LegalProperties,
    defaults: LegalMetadataDefaults,
    csv_corrections: dict[str, LegalMetadata] | None = None,
    treat_invalid_licenses_as_unknown: bool = False,
) -> tuple[etree._Element, UpdateCounter, list[Problem]]:
    """
    Update the XML tree with legal metadata, applying corrections and defaults.
    Resources without problems are fully updated (metadata applied, text properties removed).
    Resources with problems are left unchanged in the XML, but problems are collected for CSV output.

    Args:
        root: The XML root element
        properties: Configuration for property names to extract from XML
        defaults: Default values to use when metadata is missing
        csv_corrections: Dictionary of corrections from CSV (or None)
        treat_invalid_licenses_as_unknown: If True, invalid licenses are replaced with 'unknown'

    Returns:
        Tuple of (updated root element, counter of updated resources, list of problems)
    """
    # Maps each authorship group to a numeric ID so identical groups are shared
    # across resources (passed into apply_metadata_to_resource, checked afterwards).
    auth_text_to_id: dict[Authorships, int] = {}
    problems: list[Problem] = []
    counter = UpdateCounter()

    for res in root.iterchildren(tag="resource"):
        # Only resources with a media element (bitstream or iiif-uri) carry legal metadata.
        if not (media_tag_candidates := res.xpath("bitstream|iiif-uri")):
            continue

        res_id = res.attrib["id"]
        # If both bitstream and iiif-uri are present, the first one is used.
        media_elem = media_tag_candidates[0]
        # CSV corrections take highest priority; None means "no correction for this resource".
        csv_metadata = csv_corrections.get(res_id) if csv_corrections else None

        metadata = collect_metadata(
            res=res,
            properties=properties,
            defaults=defaults,
            counter=counter,
            csv_metadata=csv_metadata,
            treat_invalid_licenses_as_unknown=treat_invalid_licenses_as_unknown,
        )

        if _has_problems(metadata):
            # Missing pieces are replaced with FIXME markers so the CSV shows what to fill in.
            authorships = sorted(x for x in metadata.authorships.elems if x) or ["FIXME: Authorship missing"]
            problem = Problem(
                file_or_iiif_uri=str(media_elem.text).strip(),
                res_id=res_id,
                license=metadata.license or "FIXME: License missing",
                copyright=metadata.copyright or "FIXME: Copyright missing",
                authorships=authorships,
            )
            problems.append(problem)
        elif metadata.any():
            # Resource is complete: write metadata onto the media element.
            apply_metadata_to_resource(
                res=res,
                media_elem=media_elem,
                metadata=metadata,
                properties=properties,
                auth_text_to_id=auth_text_to_id,
            )
            _update_counter(counter, metadata)

    if auth_text_to_id:
        # Append the deduplicated <authorship> definitions once, after all resources are processed.
        add_authorship_definitions_to_xml(root, auth_text_to_id)

    return root, counter, problems
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _has_problems(metadata: LegalMetadata) -> bool:
    """
    Decide whether the metadata is incomplete or still carries FIXME markers.

    Args:
        metadata: the legal metadata to inspect

    Returns:
        True if license, copyright, or authorships must be reported in the CSV
    """

    def _missing_or_fixme(value: str | None) -> bool:
        return value is None or is_fixme_value(value)

    # Authorships are problematic if there is no non-empty entry at all,
    # or if any entry is still a FIXME marker.
    no_real_authors = not any(x for x in metadata.authorships.elems if x)
    fixme_author = any(is_fixme_value(x) for x in metadata.authorships.elems)

    return _missing_or_fixme(metadata.license) or _missing_or_fixme(metadata.copyright) or no_real_authors or fixme_author
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _update_counter(counter: UpdateCounter, metadata: LegalMetadata) -> None:
|
|
176
|
+
counter.resources_updated += 1
|
|
177
|
+
if metadata.license:
|
|
178
|
+
counter.licenses_set += 1
|
|
179
|
+
if metadata.copyright:
|
|
180
|
+
counter.copyrights_set += 1
|
|
181
|
+
if metadata.authorships:
|
|
182
|
+
counter.authorships_set += 1
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import regex
|
|
6
|
+
|
|
7
|
+
from dsp_tools.commands.update_legal.models import Authorships
|
|
8
|
+
from dsp_tools.commands.update_legal.models import LegalMetadata
|
|
9
|
+
from dsp_tools.commands.update_legal.models import Problem
|
|
10
|
+
from dsp_tools.error.exceptions import InputError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
|
|
14
|
+
class ProblemAggregator:
|
|
15
|
+
"""Aggregates multiple problems and provides DataFrame export for CSV generation."""
|
|
16
|
+
|
|
17
|
+
problems: list[Problem]
|
|
18
|
+
|
|
19
|
+
def to_dataframe(self) -> pd.DataFrame:
|
|
20
|
+
"""Convert problems to DataFrame for CSV export."""
|
|
21
|
+
problem_dicts = []
|
|
22
|
+
max_authorships = max((len(p.authorships) for p in self.problems), default=0)
|
|
23
|
+
|
|
24
|
+
for problem in self.problems:
|
|
25
|
+
row_dict = {
|
|
26
|
+
"file": problem.file_or_iiif_uri,
|
|
27
|
+
"resource_id": problem.res_id,
|
|
28
|
+
"license": problem.license,
|
|
29
|
+
"copyright": problem.copyright,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
# Add authorship columns (authorship_1, authorship_2, etc.)
|
|
33
|
+
for i in range(1, max_authorships + 1):
|
|
34
|
+
auth_idx = i - 1
|
|
35
|
+
authorship_value = problem.authorships[auth_idx] if auth_idx < len(problem.authorships) else ""
|
|
36
|
+
row_dict[f"authorship_{i}"] = authorship_value
|
|
37
|
+
|
|
38
|
+
problem_dicts.append(row_dict)
|
|
39
|
+
|
|
40
|
+
df = pd.DataFrame.from_records(problem_dicts)
|
|
41
|
+
df = df.sort_values(by=["resource_id"])
|
|
42
|
+
|
|
43
|
+
# Ensure column order matches documentation
|
|
44
|
+
base_cols = ["file", "resource_id", "license", "copyright"]
|
|
45
|
+
auth_cols = [f"authorship_{i}" for i in range(1, max_authorships + 1)]
|
|
46
|
+
df = df[base_cols + auth_cols]
|
|
47
|
+
|
|
48
|
+
return df
|
|
49
|
+
|
|
50
|
+
def save_to_csv(self, input_file: Path) -> Path:
|
|
51
|
+
"""
|
|
52
|
+
Save problems to CSV file.
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
input_file: The input XML file path, used to determine the output path
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
Path to the created CSV file
|
|
59
|
+
"""
|
|
60
|
+
output_path = input_file.parent / f"{input_file.stem}_legal_errors.csv"
|
|
61
|
+
i = 1
|
|
62
|
+
while output_path.exists():
|
|
63
|
+
stem_without_suffix = regex.sub(r"_\d+$", "", output_path.stem)
|
|
64
|
+
output_path = output_path.with_name(f"{stem_without_suffix}_{i}{output_path.suffix}")
|
|
65
|
+
i += 1
|
|
66
|
+
df = self.to_dataframe()
|
|
67
|
+
df.to_csv(output_path, index=False, mode="x")
|
|
68
|
+
return output_path
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def read_corrections_csv(csv_path: Path) -> dict[str, LegalMetadata]:
    """Read corrected legal metadata from a CSV file, and return a mapping from resource ID to LegalMetadata."""
    df = pd.read_csv(csv_path)

    # A valid corrections CSV must at least contain these columns.
    required_cols = {"file", "resource_id", "license", "copyright"}
    missing = required_cols - set(df.columns)
    if missing:
        raise InputError(f"CSV file is missing required columns: {missing}")

    def _cell(row: pd.Series, col: str) -> str | None:
        # An empty cell (NaN) becomes None; values still carrying FIXME markers
        # count as "not corrected yet" and also become None, so that XML values
        # or defaults can still apply.
        value = str(row[col]) if pd.notna(row[col]) else None
        return None if is_fixme_value(value) else value

    corrections = {}
    for _, row in df.iterrows():
        corrections[str(row["resource_id"])] = LegalMetadata(
            license=_cell(row, "license"),
            copyright=_cell(row, "copyright"),
            # Collect all authorship columns (authorship_1, authorship_2, etc.)
            authorships=_collect_authorships_from_row(row, df.columns),
        )

    return corrections
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _collect_authorships_from_row(row: pd.Series, df_columns: pd.Index) -> Authorships:
    """
    Collect all authorship values from a CSV row.

    Scans authorship_1, authorship_2, ... until the first index whose column
    is absent; empty cells and FIXME markers are dropped.

    Returns:
        Authorships built from the remaining values
    """
    collected = []
    idx = 1
    while (column := f"authorship_{idx}") in df_columns:
        cell = row[column]
        if pd.notna(cell) and not is_fixme_value(text := str(cell)):
            collected.append(text)
        idx += 1
    return Authorships.from_iterable(collected)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def is_fixme_value(value: str | None) -> bool:
|
|
127
|
+
"""Check if a value is a FIXME marker"""
|
|
128
|
+
return value is not None and value.startswith("FIXME:")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def write_problems_to_csv(input_file: Path, problems: list[Problem]) -> None:
    """Export the collected problems as CSV next to the input file and tell the user how to proceed."""
    csv_path = ProblemAggregator(problems).save_to_csv(input_file)
    user_messages = [
        f"\n⚠️ Legal metadata contains errors. Please fix them in the CSV file:\n {csv_path}",
        f"\nAfter fixing the errors, rerun the command with:\n --fixed_errors={csv_path}",
    ]
    for message in user_messages:
        print(message)
|