dsp-tools 0.9.13__py3-none-any.whl → 18.3.0.post13__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- dsp_tools/__init__.py +5 -0
- dsp_tools/cli/args.py +47 -0
- dsp_tools/cli/call_action.py +85 -0
- dsp_tools/cli/call_action_files_only.py +101 -0
- dsp_tools/cli/call_action_with_network.py +207 -0
- dsp_tools/cli/create_parsers.py +479 -0
- dsp_tools/cli/entry_point.py +322 -0
- dsp_tools/cli/utils.py +87 -0
- dsp_tools/clients/CLAUDE.md +420 -0
- dsp_tools/clients/authentication_client.py +14 -0
- dsp_tools/clients/authentication_client_live.py +66 -0
- dsp_tools/clients/connection.py +35 -0
- dsp_tools/clients/connection_live.py +233 -0
- dsp_tools/clients/fuseki_metrics.py +60 -0
- dsp_tools/clients/group_user_clients.py +35 -0
- dsp_tools/clients/group_user_clients_live.py +181 -0
- dsp_tools/clients/legal_info_client.py +23 -0
- dsp_tools/clients/legal_info_client_live.py +132 -0
- dsp_tools/clients/list_client.py +49 -0
- dsp_tools/clients/list_client_live.py +166 -0
- dsp_tools/clients/metadata_client.py +24 -0
- dsp_tools/clients/metadata_client_live.py +47 -0
- dsp_tools/clients/ontology_clients.py +49 -0
- dsp_tools/clients/ontology_create_client_live.py +166 -0
- dsp_tools/clients/ontology_get_client_live.py +80 -0
- dsp_tools/clients/permissions_client.py +68 -0
- dsp_tools/clients/project_client.py +16 -0
- dsp_tools/clients/project_client_live.py +66 -0
- dsp_tools/commands/create/communicate_problems.py +24 -0
- dsp_tools/commands/create/create.py +134 -0
- dsp_tools/commands/create/create_on_server/cardinalities.py +111 -0
- dsp_tools/commands/create/create_on_server/classes.py +99 -0
- dsp_tools/commands/create/create_on_server/complete_ontologies.py +116 -0
- dsp_tools/commands/create/create_on_server/default_permissions.py +134 -0
- dsp_tools/commands/create/create_on_server/group_users.py +165 -0
- dsp_tools/commands/create/create_on_server/lists.py +163 -0
- dsp_tools/commands/create/create_on_server/mappers.py +12 -0
- dsp_tools/commands/create/create_on_server/onto_utils.py +74 -0
- dsp_tools/commands/create/create_on_server/ontology.py +52 -0
- dsp_tools/commands/create/create_on_server/project.py +68 -0
- dsp_tools/commands/create/create_on_server/properties.py +119 -0
- dsp_tools/commands/create/exceptions.py +29 -0
- dsp_tools/commands/create/lists_only.py +66 -0
- dsp_tools/commands/create/models/create_problems.py +87 -0
- dsp_tools/commands/create/models/parsed_ontology.py +88 -0
- dsp_tools/commands/create/models/parsed_project.py +81 -0
- dsp_tools/commands/create/models/rdf_ontology.py +12 -0
- dsp_tools/commands/create/models/server_project_info.py +100 -0
- dsp_tools/commands/create/parsing/parse_lists.py +45 -0
- dsp_tools/commands/create/parsing/parse_ontology.py +243 -0
- dsp_tools/commands/create/parsing/parse_project.py +149 -0
- dsp_tools/commands/create/parsing/parsing_utils.py +40 -0
- dsp_tools/commands/create/project_validate.py +595 -0
- dsp_tools/commands/create/serialisation/ontology.py +119 -0
- dsp_tools/commands/create/serialisation/project.py +44 -0
- dsp_tools/commands/excel2json/CLAUDE.md +101 -0
- dsp_tools/commands/excel2json/json_header.py +321 -0
- dsp_tools/commands/excel2json/lists/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/compliance_checks.py +292 -0
- dsp_tools/commands/excel2json/lists/make_lists.py +247 -0
- dsp_tools/commands/excel2json/lists/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/models/deserialise.py +30 -0
- dsp_tools/commands/excel2json/lists/models/input_error.py +216 -0
- dsp_tools/commands/excel2json/lists/models/serialise.py +57 -0
- dsp_tools/commands/excel2json/lists/utils.py +81 -0
- dsp_tools/commands/excel2json/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/models/input_error.py +416 -0
- dsp_tools/commands/excel2json/models/json_header.py +175 -0
- dsp_tools/commands/excel2json/models/list_node_name.py +16 -0
- dsp_tools/commands/excel2json/models/ontology.py +76 -0
- dsp_tools/commands/excel2json/old_lists.py +328 -0
- dsp_tools/commands/excel2json/project.py +280 -0
- dsp_tools/commands/excel2json/properties.py +370 -0
- dsp_tools/commands/excel2json/resources.py +336 -0
- dsp_tools/commands/excel2json/utils.py +352 -0
- dsp_tools/commands/excel2xml/__init__.py +7 -0
- dsp_tools/commands/excel2xml/excel2xml_cli.py +523 -0
- dsp_tools/commands/excel2xml/excel2xml_lib.py +1953 -0
- dsp_tools/commands/excel2xml/propertyelement.py +47 -0
- dsp_tools/commands/get/__init__.py +0 -0
- dsp_tools/commands/get/get.py +166 -0
- dsp_tools/commands/get/get_permissions.py +257 -0
- dsp_tools/commands/get/get_permissions_legacy.py +89 -0
- dsp_tools/commands/get/legacy_models/__init__.py +0 -0
- dsp_tools/commands/get/legacy_models/context.py +318 -0
- dsp_tools/commands/get/legacy_models/group.py +241 -0
- dsp_tools/commands/get/legacy_models/helpers.py +47 -0
- dsp_tools/commands/get/legacy_models/listnode.py +390 -0
- dsp_tools/commands/get/legacy_models/model.py +12 -0
- dsp_tools/commands/get/legacy_models/ontology.py +324 -0
- dsp_tools/commands/get/legacy_models/project.py +366 -0
- dsp_tools/commands/get/legacy_models/propertyclass.py +417 -0
- dsp_tools/commands/get/legacy_models/resourceclass.py +676 -0
- dsp_tools/commands/get/legacy_models/user.py +438 -0
- dsp_tools/commands/get/models/__init__.py +0 -0
- dsp_tools/commands/get/models/permissions_models.py +10 -0
- dsp_tools/commands/id2iri.py +258 -0
- dsp_tools/commands/ingest_xmlupload/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/bulk_ingest_client.py +178 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/apply_ingest_id.py +69 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/upload_xml.py +166 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/user_information.py +121 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/ingest_files.py +64 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/filechecker.py +20 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +57 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_failures.py +66 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_files.py +67 -0
- dsp_tools/commands/resume_xmlupload/__init__.py +0 -0
- dsp_tools/commands/resume_xmlupload/resume_xmlupload.py +96 -0
- dsp_tools/commands/start_stack.py +428 -0
- dsp_tools/commands/update_legal/CLAUDE.md +344 -0
- dsp_tools/commands/update_legal/__init__.py +0 -0
- dsp_tools/commands/update_legal/core.py +182 -0
- dsp_tools/commands/update_legal/csv_operations.py +135 -0
- dsp_tools/commands/update_legal/models.py +87 -0
- dsp_tools/commands/update_legal/xml_operations.py +247 -0
- dsp_tools/commands/validate_data/CLAUDE.md +159 -0
- dsp_tools/commands/validate_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/constants.py +59 -0
- dsp_tools/commands/validate_data/mappers.py +143 -0
- dsp_tools/commands/validate_data/models/__init__.py +0 -0
- dsp_tools/commands/validate_data/models/api_responses.py +45 -0
- dsp_tools/commands/validate_data/models/input_problems.py +119 -0
- dsp_tools/commands/validate_data/models/rdf_like_data.py +117 -0
- dsp_tools/commands/validate_data/models/validation.py +106 -0
- dsp_tools/commands/validate_data/prepare_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/prepare_data/get_rdf_like_data.py +296 -0
- dsp_tools/commands/validate_data/prepare_data/make_data_graph.py +91 -0
- dsp_tools/commands/validate_data/prepare_data/prepare_data.py +184 -0
- dsp_tools/commands/validate_data/process_validation_report/__init__.py +0 -0
- dsp_tools/commands/validate_data/process_validation_report/get_user_validation_message.py +358 -0
- dsp_tools/commands/validate_data/process_validation_report/query_validation_result.py +507 -0
- dsp_tools/commands/validate_data/process_validation_report/reformat_validation_results.py +150 -0
- dsp_tools/commands/validate_data/shacl_cli_validator.py +70 -0
- dsp_tools/commands/validate_data/sparql/__init__.py +0 -0
- dsp_tools/commands/validate_data/sparql/cardinality_shacl.py +209 -0
- dsp_tools/commands/validate_data/sparql/construct_shacl.py +92 -0
- dsp_tools/commands/validate_data/sparql/legal_info_shacl.py +36 -0
- dsp_tools/commands/validate_data/sparql/value_shacl.py +357 -0
- dsp_tools/commands/validate_data/utils.py +59 -0
- dsp_tools/commands/validate_data/validate_data.py +283 -0
- dsp_tools/commands/validate_data/validation/__init__.py +0 -0
- dsp_tools/commands/validate_data/validation/check_duplicate_files.py +55 -0
- dsp_tools/commands/validate_data/validation/check_for_unknown_classes.py +67 -0
- dsp_tools/commands/validate_data/validation/get_validation_report.py +94 -0
- dsp_tools/commands/validate_data/validation/validate_ontology.py +107 -0
- dsp_tools/commands/xmlupload/CLAUDE.md +292 -0
- dsp_tools/commands/xmlupload/__init__.py +0 -0
- dsp_tools/commands/xmlupload/iri_resolver.py +21 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/__init__.py +0 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/constants.py +63 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/jsonld_utils.py +44 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_file_value.py +77 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_resource_and_values.py +114 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_values.py +262 -0
- dsp_tools/commands/xmlupload/models/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/bitstream_info.py +18 -0
- dsp_tools/commands/xmlupload/models/formatted_text_value.py +10 -0
- dsp_tools/commands/xmlupload/models/ingest.py +143 -0
- dsp_tools/commands/xmlupload/models/input_problems.py +58 -0
- dsp_tools/commands/xmlupload/models/lookup_models.py +21 -0
- dsp_tools/commands/xmlupload/models/permission.py +45 -0
- dsp_tools/commands/xmlupload/models/permissions_parsed.py +93 -0
- dsp_tools/commands/xmlupload/models/processed/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/processed/file_values.py +29 -0
- dsp_tools/commands/xmlupload/models/processed/res.py +27 -0
- dsp_tools/commands/xmlupload/models/processed/values.py +101 -0
- dsp_tools/commands/xmlupload/models/rdf_models.py +26 -0
- dsp_tools/commands/xmlupload/models/upload_clients.py +14 -0
- dsp_tools/commands/xmlupload/models/upload_state.py +20 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/__init__.py +0 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/ark2iri.py +55 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/get_processed_resources.py +252 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/iiif_uri_validator.py +50 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/list_client.py +120 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/prepare_xml_input.py +67 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/read_validate_xml_file.py +58 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/transform_input_values.py +118 -0
- dsp_tools/commands/xmlupload/resource_create_client.py +25 -0
- dsp_tools/commands/xmlupload/richtext_id2iri.py +37 -0
- dsp_tools/commands/xmlupload/stash/__init__.py +0 -0
- dsp_tools/commands/xmlupload/stash/analyse_circular_reference_graph.py +236 -0
- dsp_tools/commands/xmlupload/stash/create_info_for_graph.py +53 -0
- dsp_tools/commands/xmlupload/stash/graph_models.py +87 -0
- dsp_tools/commands/xmlupload/stash/stash_circular_references.py +68 -0
- dsp_tools/commands/xmlupload/stash/stash_models.py +109 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_resptr_props.py +106 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_xml_texts.py +196 -0
- dsp_tools/commands/xmlupload/upload_config.py +76 -0
- dsp_tools/commands/xmlupload/write_diagnostic_info.py +27 -0
- dsp_tools/commands/xmlupload/xmlupload.py +516 -0
- dsp_tools/config/__init__.py +0 -0
- dsp_tools/config/logger_config.py +69 -0
- dsp_tools/config/warnings_config.py +32 -0
- dsp_tools/error/__init__.py +0 -0
- dsp_tools/error/custom_warnings.py +39 -0
- dsp_tools/error/exceptions.py +204 -0
- dsp_tools/error/problems.py +10 -0
- dsp_tools/error/xmllib_errors.py +20 -0
- dsp_tools/error/xmllib_warnings.py +54 -0
- dsp_tools/error/xmllib_warnings_util.py +159 -0
- dsp_tools/error/xsd_validation_error_msg.py +19 -0
- dsp_tools/legacy_models/__init__.py +0 -0
- dsp_tools/legacy_models/datetimestamp.py +81 -0
- dsp_tools/legacy_models/langstring.py +253 -0
- dsp_tools/legacy_models/projectContext.py +49 -0
- dsp_tools/py.typed +0 -0
- dsp_tools/resources/schema/data.xsd +648 -0
- dsp_tools/resources/schema/lists-only.json +72 -0
- dsp_tools/resources/schema/project.json +1258 -0
- dsp_tools/resources/schema/properties-only.json +874 -0
- dsp_tools/resources/schema/resources-only.json +140 -0
- dsp_tools/resources/start-stack/docker-compose.override-host.j2 +11 -0
- dsp_tools/resources/start-stack/docker-compose.override.yml +11 -0
- dsp_tools/resources/start-stack/docker-compose.yml +88 -0
- dsp_tools/resources/start-stack/dsp-app-config.json +45 -0
- dsp_tools/resources/start-stack/dsp-app-config.override-host.j2 +26 -0
- dsp_tools/resources/validate_data/api-shapes-resource-cardinalities.ttl +191 -0
- dsp_tools/resources/validate_data/api-shapes.ttl +804 -0
- dsp_tools/resources/validate_data/shacl-cli-image.yml +4 -0
- dsp_tools/resources/validate_data/validate-ontology.ttl +99 -0
- dsp_tools/utils/__init__.py +0 -0
- dsp_tools/utils/ansi_colors.py +32 -0
- dsp_tools/utils/data_formats/__init__.py +0 -0
- dsp_tools/utils/data_formats/date_util.py +166 -0
- dsp_tools/utils/data_formats/iri_util.py +30 -0
- dsp_tools/utils/data_formats/shared.py +81 -0
- dsp_tools/utils/data_formats/uri_util.py +76 -0
- dsp_tools/utils/fuseki_bloating.py +63 -0
- dsp_tools/utils/json_parsing.py +22 -0
- dsp_tools/utils/rdf_constants.py +42 -0
- dsp_tools/utils/rdflib_utils.py +10 -0
- dsp_tools/utils/replace_id_with_iri.py +66 -0
- dsp_tools/utils/request_utils.py +238 -0
- dsp_tools/utils/xml_parsing/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/get_lookups.py +32 -0
- dsp_tools/utils/xml_parsing/get_parsed_resources.py +325 -0
- dsp_tools/utils/xml_parsing/models/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/models/parsed_resource.py +76 -0
- dsp_tools/utils/xml_parsing/parse_clean_validate_xml.py +137 -0
- dsp_tools/xmllib/CLAUDE.md +302 -0
- dsp_tools/xmllib/__init__.py +49 -0
- dsp_tools/xmllib/general_functions.py +877 -0
- dsp_tools/xmllib/internal/__init__.py +0 -0
- dsp_tools/xmllib/internal/checkers.py +162 -0
- dsp_tools/xmllib/internal/circumvent_circular_imports.py +36 -0
- dsp_tools/xmllib/internal/constants.py +46 -0
- dsp_tools/xmllib/internal/input_converters.py +155 -0
- dsp_tools/xmllib/internal/serialise_file_value.py +57 -0
- dsp_tools/xmllib/internal/serialise_resource.py +177 -0
- dsp_tools/xmllib/internal/serialise_values.py +152 -0
- dsp_tools/xmllib/internal/type_aliases.py +11 -0
- dsp_tools/xmllib/models/__init__.py +0 -0
- dsp_tools/xmllib/models/config_options.py +28 -0
- dsp_tools/xmllib/models/date_formats.py +48 -0
- dsp_tools/xmllib/models/dsp_base_resources.py +1542 -0
- dsp_tools/xmllib/models/internal/__init__.py +0 -0
- dsp_tools/xmllib/models/internal/file_values.py +172 -0
- dsp_tools/xmllib/models/internal/geometry.py +162 -0
- dsp_tools/xmllib/models/internal/migration_metadata.py +55 -0
- dsp_tools/xmllib/models/internal/serialise_permissions.py +66 -0
- dsp_tools/xmllib/models/internal/values.py +342 -0
- dsp_tools/xmllib/models/licenses/__init__.py +0 -0
- dsp_tools/xmllib/models/licenses/other.py +59 -0
- dsp_tools/xmllib/models/licenses/recommended.py +107 -0
- dsp_tools/xmllib/models/permissions.py +41 -0
- dsp_tools/xmllib/models/res.py +1782 -0
- dsp_tools/xmllib/models/root.py +348 -0
- dsp_tools/xmllib/value_checkers.py +434 -0
- dsp_tools/xmllib/value_converters.py +777 -0
- dsp_tools-18.3.0.post13.dist-info/METADATA +90 -0
- dsp_tools-18.3.0.post13.dist-info/RECORD +286 -0
- dsp_tools-18.3.0.post13.dist-info/WHEEL +4 -0
- dsp_tools-18.3.0.post13.dist-info/entry_points.txt +3 -0
- dsp_tools-0.9.13.dist-info/LICENSE +0 -674
- dsp_tools-0.9.13.dist-info/METADATA +0 -144
- dsp_tools-0.9.13.dist-info/RECORD +0 -71
- dsp_tools-0.9.13.dist-info/WHEEL +0 -5
- dsp_tools-0.9.13.dist-info/entry_points.txt +0 -3
- dsp_tools-0.9.13.dist-info/top_level.txt +0 -1
- dsplib/models/connection.py +0 -272
- dsplib/models/group.py +0 -296
- dsplib/models/helpers.py +0 -505
- dsplib/models/langstring.py +0 -277
- dsplib/models/listnode.py +0 -578
- dsplib/models/model.py +0 -20
- dsplib/models/ontology.py +0 -448
- dsplib/models/permission.py +0 -112
- dsplib/models/project.py +0 -547
- dsplib/models/propertyclass.py +0 -505
- dsplib/models/resource.py +0 -366
- dsplib/models/resourceclass.py +0 -810
- dsplib/models/sipi.py +0 -30
- dsplib/models/user.py +0 -731
- dsplib/models/value.py +0 -1000
- dsplib/utils/knora-data-schema.xsd +0 -454
- dsplib/utils/knora-schema-lists.json +0 -83
- dsplib/utils/knora-schema.json +0 -434
- dsplib/utils/onto_commons.py +0 -24
- dsplib/utils/onto_create_lists.py +0 -73
- dsplib/utils/onto_create_ontology.py +0 -442
- dsplib/utils/onto_get.py +0 -58
- dsplib/utils/onto_validate.py +0 -33
- dsplib/utils/xml_upload.py +0 -539
- dsplib/widgets/doublepassword.py +0 -80
- knora/MLS-import-libraries.py +0 -84
- knora/dsp_tools.py +0 -96
- knora/dsplib/models/connection.py +0 -272
- knora/dsplib/models/group.py +0 -296
- knora/dsplib/models/helpers.py +0 -506
- knora/dsplib/models/langstring.py +0 -277
- knora/dsplib/models/listnode.py +0 -578
- knora/dsplib/models/model.py +0 -20
- knora/dsplib/models/ontology.py +0 -448
- knora/dsplib/models/permission.py +0 -112
- knora/dsplib/models/project.py +0 -583
- knora/dsplib/models/propertyclass.py +0 -505
- knora/dsplib/models/resource.py +0 -416
- knora/dsplib/models/resourceclass.py +0 -811
- knora/dsplib/models/sipi.py +0 -35
- knora/dsplib/models/user.py +0 -731
- knora/dsplib/models/value.py +0 -1000
- knora/dsplib/utils/knora-data-schema.xsd +0 -464
- knora/dsplib/utils/knora-schema-lists.json +0 -83
- knora/dsplib/utils/knora-schema.json +0 -444
- knora/dsplib/utils/onto_commons.py +0 -24
- knora/dsplib/utils/onto_create_lists.py +0 -73
- knora/dsplib/utils/onto_create_ontology.py +0 -451
- knora/dsplib/utils/onto_get.py +0 -58
- knora/dsplib/utils/onto_validate.py +0 -33
- knora/dsplib/utils/xml_upload.py +0 -540
- knora/dsplib/widgets/doublepassword.py +0 -80
- knora/knora.py +0 -2108
- knora/test.py +0 -99
- knora/testit.py +0 -76
- knora/xml2knora.py +0 -633
- {dsplib → dsp_tools/cli}/__init__.py +0 -0
- {dsplib/models → dsp_tools/clients}/__init__.py +0 -0
- {dsplib/utils → dsp_tools/commands}/__init__.py +0 -0
- {dsplib/widgets → dsp_tools/commands/create}/__init__.py +0 -0
- {knora → dsp_tools/commands/create/create_on_server}/__init__.py +0 -0
- {knora/dsplib → dsp_tools/commands/create/models}/__init__.py +0 -0
- {knora/dsplib/models → dsp_tools/commands/create/parsing}/__init__.py +0 -0
- {knora/dsplib/utils → dsp_tools/commands/create/serialisation}/__init__.py +0 -0
- {knora/dsplib/widgets → dsp_tools/commands/excel2json}/__init__.py +0 -0
dsp_tools/commands/excel2json/resources.py
@@ -0,0 +1,336 @@
from __future__ import annotations

import importlib.resources
import json
import warnings
from copy import deepcopy
from typing import Any
from typing import Optional

import jsonpath_ng.ext
import jsonschema
import pandas as pd
import regex

from dsp_tools.commands.excel2json.models.input_error import ExcelFileProblem
from dsp_tools.commands.excel2json.models.input_error import ExcelSheetProblem
from dsp_tools.commands.excel2json.models.input_error import JsonValidationResourceProblem
from dsp_tools.commands.excel2json.models.input_error import MandatorySheetsMissingProblem
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import ResourceSheetNotListedProblem
from dsp_tools.commands.excel2json.models.json_header import PermissionsOverrulesUnprefixed
from dsp_tools.commands.excel2json.models.ontology import OntoResource
from dsp_tools.commands.excel2json.models.ontology import ResourceCardinality
from dsp_tools.commands.excel2json.utils import add_optional_columns
from dsp_tools.commands.excel2json.utils import check_column_for_duplicate
from dsp_tools.commands.excel2json.utils import check_contains_required_columns
from dsp_tools.commands.excel2json.utils import check_permissions
from dsp_tools.commands.excel2json.utils import find_missing_required_values
from dsp_tools.commands.excel2json.utils import get_comments
from dsp_tools.commands.excel2json.utils import get_labels
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.error.exceptions import InputError
from dsp_tools.error.problems import Problem

languages = ["en", "de", "fr", "it", "rm"]


def excel2resources(
    excelfile: str,
    path_to_output_file: Optional[str] = None,
) -> tuple[list[dict[str, Any]], PermissionsOverrulesUnprefixed, bool]:
    """
    Converts resources described in an Excel file into a "resources" section which can be inserted into a JSON
    project file.

    Args:
        excelfile: path to the Excel file containing the resources
        path_to_output_file: if provided, the output is written into this JSON file
            (otherwise, it's only returned as return value)

    Raises:
        InputError: if something went wrong

    Returns:
        - the "resources" section as Python list,
        - the unprefixed "default_permissions_overrule",
        - the success status (True if everything went well)
    """

    all_dfs = read_and_clean_all_sheets(excelfile)

    if validation_problems := _validate_excel_file(all_dfs):
        msg = validation_problems.execute_error_protocol()
        raise InputError(msg)
    classes_df, resource_dfs = _prepare_classes_df(all_dfs)

    # transform every row into a resource
    res = [_row2resource(row, resource_dfs.get(row["name"])) for i, row in classes_df.iterrows()]
    resources = [x.serialise() for x in res]
    default_permissions_overrule = _extract_default_permissions_overrule(classes_df)

    # write final "resources" section into a JSON file
    _validate_resources(resources_list=resources)

    if path_to_output_file:
        with open(file=path_to_output_file, mode="w", encoding="utf-8") as file:
            json.dump(resources, file, indent=4, ensure_ascii=False)
        print(f"resources section was created successfully and written to file '{path_to_output_file}'")

    return resources, default_permissions_overrule, True


def _validate_excel_file(all_dfs: dict[str, pd.DataFrame]) -> ExcelFileProblem | None:
    df_dict = deepcopy(all_dfs)
    lower_case_to_original = {k.lower(): k for k in df_dict}
    if not (cls_name := lower_case_to_original.get("classes")):
        return ExcelFileProblem(
            "resources.xlsx", [MandatorySheetsMissingProblem(["classes"], list(lower_case_to_original.values()))]
        )
    classes_df = df_dict.pop(cls_name)
    problems: list[Problem] = []
    if cls_problem := _validate_classes_excel_sheet(classes_df, set(df_dict)):
        problems.append(cls_problem)
    if sheet_problems := _validate_individual_class_sheets(df_dict):
        problems.extend(sheet_problems)
    if permissions_prob := check_permissions(df=classes_df, allowed_vals=["private", "limited_view"]):
        problems.append(permissions_prob)
    if problems:
        return ExcelFileProblem("resources.xlsx", problems)
    return None


def _validate_classes_excel_sheet(classes_df: pd.DataFrame, sheet_names: set[str]) -> ExcelSheetProblem | None:
    if any(classes_df.get(lang) is not None for lang in languages):
        warnings.warn(
            f"The file 'resources.xlsx' uses {languages} as column titles, which is deprecated. "
            f"Please use {[f'label_{lang}' for lang in languages]}"
        )
    problems: list[Problem] = []
    required_cols = ["name", "super"]
    if missing_cols := check_contains_required_columns(classes_df, set(required_cols)):
        # If this condition is not fulfilled the following tests will produce KeyErrors
        return ExcelSheetProblem("classes", [missing_cols])
    names_listed = set(classes_df["name"].tolist())
    if not sheet_names.issubset(names_listed):
        problems.append(ResourceSheetNotListedProblem(sheet_names - names_listed))
    if missing_values := find_missing_required_values(classes_df, required_cols):
        problems.extend(missing_values)
    if duplicate_check := check_column_for_duplicate(classes_df, "name"):
        problems.append(duplicate_check)
    if problems:
        return ExcelSheetProblem("classes", problems)
    return None


def _validate_individual_class_sheets(class_df_dict: dict[str, pd.DataFrame]) -> list[Problem]:
    required_cols = ["property", "cardinality"]
    missing_required_columns = {
        sheet: missing_cols
        for sheet, df in class_df_dict.items()
        if (missing_cols := check_contains_required_columns(df, set(required_cols)))
    }
    if missing_required_columns:
        return [ExcelSheetProblem(sheet, [missing]) for sheet, missing in missing_required_columns.items()]
    missing_values_position: list[PositionInExcel] = []
    for sheet_name, df in class_df_dict.items():
        if missing_vals_position := find_missing_required_values(df, required_cols, sheet_name):
            missing_values_position.extend(missing_vals_position)
    if missing_values_position:
        return [MissingValuesProblem(missing_values_position)]
    return []


def _prepare_classes_df(resource_dfs: dict[str, pd.DataFrame]) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]]:
    lower_case_to_original = {k.lower(): k for k in resource_dfs}
    classes_df = resource_dfs.pop(lower_case_to_original["classes"])
    classes_df = add_optional_columns(
        classes_df,
        {
            "label_en",
            "label_de",
            "label_fr",
            "label_it",
            "label_rm",
            "comment_en",
            "comment_de",
            "comment_fr",
            "comment_it",
            "comment_rm",
            "default_permissions_overrule",
        },
    )
    resource_dfs = {k: add_optional_columns(v, {"gui_order"}) for k, v in resource_dfs.items()}
    return classes_df, resource_dfs


def _row2resource(
    class_info_row: pd.Series[Any],
    class_df_with_cardinalities: pd.DataFrame | None,
) -> OntoResource:
    """
    Method that reads one row from the "classes" DataFrame,
    opens the corresponding details DataFrame,
    and builds a dict object of the resource.

    Args:
        class_info_row: row from the "classes" DataFrame
        class_df_with_cardinalities: Excel sheet of the individual class

    Raises:
        InputError: if the row or the details sheet contains invalid data

    Returns:
        dict object of the resource
    """

    class_name = class_info_row["name"]
    labels = get_labels(class_info_row)
    supers = [s.strip() for s in class_info_row["super"].split(",")]
    comments = get_comments(class_info_row)
    cards = _make_cardinality_section(class_name, class_df_with_cardinalities)
    return OntoResource(name=class_name, super=supers, labels=labels, comments=comments, cardinalities=cards)


def _make_cardinality_section(
    class_name: str, class_df_with_cardinalities: pd.DataFrame | None
) -> list[ResourceCardinality] | None:
    if class_df_with_cardinalities is None:
        return None
    if len(class_df_with_cardinalities) == 0:
        return None
    return _create_all_cardinalities(class_name, class_df_with_cardinalities)


def _create_all_cardinalities(class_name: str, class_df_with_cardinalities: pd.DataFrame) -> list[ResourceCardinality]:
    class_df_with_cardinalities = _check_complete_gui_order(class_name, class_df_with_cardinalities)
    cards = [_make_one_cardinality(detail_row) for _, detail_row in class_df_with_cardinalities.iterrows()]
    return cards


def _make_one_cardinality(detail_row: pd.Series[str | int]) -> ResourceCardinality:
    prop_str = str(detail_row["property"])
    knora_props = ["seqnum", "isPartOf"]
    prop = prop_str if ":" in prop_str or prop_str in knora_props else f":{prop_str}"
    return ResourceCardinality(prop, str(detail_row["cardinality"]).lower(), int(detail_row["gui_order"]))


def _check_complete_gui_order(class_name: str, class_df_with_cardinalities: pd.DataFrame) -> pd.DataFrame:
    detail_problem_msg = ""
    attempt_conversion = False
    if "gui_order" not in class_df_with_cardinalities:
        pass
    elif class_df_with_cardinalities["gui_order"].isna().all():
        pass
    elif class_df_with_cardinalities["gui_order"].isna().any():
        detail_problem_msg = "some rows in the column 'gui_order' are empty."
    elif not class_df_with_cardinalities["gui_order"].isna().all():
        attempt_conversion = True

    if attempt_conversion:
        try:
            class_df_with_cardinalities["gui_order"] = [int(float(x)) for x in class_df_with_cardinalities["gui_order"]]
            return class_df_with_cardinalities
        except ValueError:
            detail_problem_msg = (
                "some rows in the column 'gui_order' contain invalid characters "
                "that could not be converted to an integer."
            )

    class_df_with_cardinalities["gui_order"] = list(range(1, len(class_df_with_cardinalities) + 1))

    if detail_problem_msg:
        complete_msg = (
            f"In the sheet '{class_name}' of the file 'resources.xlsx', "
            f"{detail_problem_msg}\n"
            f"Values have been filled in automatically, "
            f"so that the gui-order reflects the order of the properties in the file."
        )
        warnings.warn(complete_msg)
    return class_df_with_cardinalities


def _validate_resources(resources_list: list[dict[str, Any]]) -> None:
    """
    This function checks if the "resources" section of a JSON project file is valid according to the JSON schema,
    and if the resource names are unique.

    Args:
        resources_list: the "resources" section of a JSON project as a list of dicts

    Raises:
        InputError: if the validation fails
    """
    with (
        importlib.resources.files("dsp_tools")
        .joinpath("resources/schema/resources-only.json")
        .open(encoding="utf-8") as schema_file
    ):
        resources_schema = json.load(schema_file)
    try:
        jsonschema.validate(instance=resources_list, schema=resources_schema)
    except jsonschema.ValidationError as err:
        validation_problem = _find_validation_problem(
            validation_error=err,
            resources_list=resources_list,
        )
        msg = "\nThe Excel file 'resources.xlsx' did not pass validation." + validation_problem.execute_error_protocol()
        raise InputError(msg) from None


def _find_validation_problem(
    validation_error: jsonschema.ValidationError, resources_list: list[dict[str, Any]]
) -> JsonValidationResourceProblem:
    if json_path_to_resource := regex.search(r"^\$\[(\d+)\]", validation_error.json_path):
        # fmt: off
        wrong_res_name = (
            jsonpath_ng.ext.parse(json_path_to_resource.group(0))
            .find(resources_list)[0]
            .value["name"]
        )
        # fmt: on
        if affected_field := regex.search(
            r"name|labels|comments|super|cardinalities\[(\d+)\]", validation_error.json_path
        ):
            affected_value = affected_field.group(0)
            problematic_resource, excel_sheet, excel_row, excel_column = "", None, None, None

            if affected_value in ["name", "labels", "comments", "super"]:
                excel_sheet = "classes"
                problematic_resource = wrong_res_name
                excel_row = int(json_path_to_resource.group(1)) + 2
                excel_column = affected_value

            elif "cardinalities" in affected_value:
                excel_row = int(affected_field.group(1)) + 2
                excel_sheet = wrong_res_name

                if validation_error.json_path.endswith("cardinality"):
                    excel_column = "Cardinality"

                elif validation_error.json_path.endswith("propname"):
                    excel_column = "Property"

            return JsonValidationResourceProblem(
                problematic_resource=problematic_resource,
                excel_position=PositionInExcel(sheet=excel_sheet, column=excel_column, row=excel_row),
                original_msg=validation_error.message,
            )
    return JsonValidationResourceProblem(
        original_msg=validation_error.message,
        message_path=validation_error.json_path,
    )


def _extract_default_permissions_overrule(classes_df: pd.DataFrame) -> PermissionsOverrulesUnprefixed:
    result = PermissionsOverrulesUnprefixed(private=[], limited_view=[])
    for _, row in classes_df.iterrows():
        perm = row.get("default_permissions_overrule")
        if pd.isna(perm):
            continue
        if perm.strip().lower() == "private":
            result.private.append(row["name"])
        elif perm.strip().lower() == "limited_view":
            result.limited_view.append(row["name"])
    return result
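Read as a standalone API, the new `excel2resources` entry point above can be called directly from Python. The following is a minimal usage sketch, not taken from the package: the file paths are placeholders, and it assumes a `resources.xlsx` that passes the validation steps shown above (a "classes" sheet plus one optional sheet per class).

from dsp_tools.commands.excel2json.resources import excel2resources

# "resources.xlsx" and "resources.json" are placeholder paths for this sketch.
resources, overrules, ok = excel2resources(
    excelfile="resources.xlsx",
    path_to_output_file="resources.json",  # omit to get the section back without writing a file
)
# each serialised resource is a dict with at least a "name" key
# (this is what _find_validation_problem above relies on)
print(ok, [res["name"] for res in resources])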
dsp_tools/commands/excel2json/utils.py
@@ -0,0 +1,352 @@
from __future__ import annotations

from pathlib import Path
from typing import Any
from unittest import mock

import numpy as np
import pandas as pd
import regex

from dsp_tools.commands.excel2json.models.input_error import DuplicateSheetProblem
from dsp_tools.commands.excel2json.models.input_error import DuplicatesInColumnProblem
from dsp_tools.commands.excel2json.models.input_error import ExcelFileProblem
from dsp_tools.commands.excel2json.models.input_error import InvalidPermissionsOverrule
from dsp_tools.commands.excel2json.models.input_error import InvalidPermissionsOverruleProblem
from dsp_tools.commands.excel2json.models.input_error import InvalidSheetNameProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import RequiredColumnMissingProblem
from dsp_tools.commands.excel2json.models.ontology import LanguageDict
from dsp_tools.error.exceptions import InputError
from dsp_tools.error.exceptions import UserFilepathNotFoundError

languages = ["en", "de", "fr", "it", "rm"]


def read_and_clean_all_sheets(excelfile: str | Path) -> dict[str, pd.DataFrame]:
    """
    This function reads an Excel file with all its sheets.
    If there is a ValueError, it patches the openpyxl part that causes the error
    and opens it with that patch.
    It cleans the dataframes and then returns them in the form {sheet_name: dataframe}.

    Args:
        excelfile: path to the Excel file

    Returns:
        All sheets of the excel file, in the form of a dictionary {sheet_name: dataframe}

    Raises:
        InputError: If the sheets are not correctly named
    """
    if not Path(excelfile).exists():
        raise UserFilepathNotFoundError(excelfile)
    try:
        df_dict = pd.read_excel(excelfile, sheet_name=None)
    except ValueError:
        # Pandas relies on openpyxl to parse XLSX files.
        # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties
        # (unclear which formatting properties exactly).
        # Apparently, the excel2json test files have one of the unsupported formatting properties.
        # Credits: https://stackoverflow.com/a/70537454/14414188
        with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100):
            df_dict = pd.read_excel(excelfile, sheet_name=None)
    _find_duplicate_col_names(str(excelfile), list(df_dict))
    try:
        return {name.strip(""): clean_data_frame(df) for name, df in df_dict.items()}
    except AttributeError:
        msg = InvalidSheetNameProblem(str(excelfile), list(df_dict.keys())).execute_error_protocol()
        raise InputError(msg) from None


def _find_duplicate_col_names(excelfile: str, col_names: list[str]) -> None:
    sheet_names = [str(x).lower().strip() for x in col_names]
    duplicate_names = list({x for x in sheet_names if sheet_names.count(x) > 1})
    if duplicate_names:
        msg = ExcelFileProblem(str(excelfile), [DuplicateSheetProblem(duplicate_names)]).execute_error_protocol()
        raise InputError(msg) from None


def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function takes a pd.DataFrame and removes:
    - Leading and trailing spaces from the column names
    - Leading and trailing spaces from each cell and any characters in the cells that are not part of any known
      language, for example, linebreaks and replaces it with a pd.NA.
    - Removes all rows that are empty in all columns

    Args:
        df: The pd.DataFrame that is to be cleaned

    Returns:
        pd.DataFrame which has the above-mentioned removed
    """
    # Remove leading and trailing blanks in column names and make them lower case
    df = df.rename(columns=lambda x: x.strip().lower())
    # Remove the values of all cells that do not at least contain one character of any known language and removes
    # leading and trailing spaces.
    df = df.map(
        lambda x: str(x).strip() if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA
    )
    # drop all the rows that are entirely empty
    df = df.dropna(axis=0, how="all")
    return df


def check_contains_required_columns(
    df: pd.DataFrame, required_columns: set[str]
) -> None | RequiredColumnMissingProblem:
    """
    This function checks if all the columns from the set are in the pd.DataFrame.
    Additional columns to the ones in the set are allowed.

    Args:
        df: pd.DataFrame that is checked
        required_columns: set of column names

    Returns:
        An object if there is a problem else None.
    """
    if not required_columns.issubset(set(df.columns)):
        required = list(required_columns.difference(set(df.columns)))
        return RequiredColumnMissingProblem(columns=required)
    return None


def check_column_for_duplicate(df: pd.DataFrame, to_check_column: str) -> None | DuplicatesInColumnProblem:
    """
    This function checks if a specified column contains duplicate values.
    Empty cells (pd.NA) also count as duplicates.

    Args:
        df: pd.DataFrame that is checked for duplicates
        to_check_column: Name of the column that must not contain duplicates

    Returns:
        If there are problems it returns an object that stores the relevant user information.

    """
    if df[to_check_column].duplicated().any():
        duplicate_values = df[to_check_column][df[to_check_column].duplicated()].tolist()
        return DuplicatesInColumnProblem(
            column=to_check_column,
            duplicate_values=duplicate_values,
        )
    else:
        return None


def find_missing_required_values(
    df: pd.DataFrame, required_values_columns: list[str], sheetname: str | None = None
) -> list[PositionInExcel] | None:
    """
    If there are empty cells in the specified columns,
    It specifies the column and row numbers of all missing values and
    returns all the locations wrapped in the MissingValuesProblem

    Args:
        df: pd.DataFrame that is checked
        required_values_columns: a list of column names that may not contain empty cells
        sheetname: optional name of the Excel sheet

    Returns:
        Locations of missing values
        None if all are filled
    """
    if missing_values := check_required_values(df, required_values_columns):
        locations = []
        row_nums = get_wrong_row_numbers(missing_values)
        for col, nums in row_nums.items():
            locations.extend([PositionInExcel(sheet=sheetname, column=col, row=x) for x in nums])
        return locations
    return None


def check_required_values(df: pd.DataFrame, required_values_columns: list[str]) -> dict[str, pd.Series[bool]]:
    """
    If there are any empty cells in the column, it adds the column name and a boolean pd.Series to the dictionary.
    If there are no empty cells, then it is not included in the dictionary.
    If no column has any empty cells, then it returns an empty dictionary.

    Args:
        df: pd.DataFrame that is checked
        required_values_columns: a list of column names that may not contain empty cells

    Returns:
        a dictionary with the column names as key and pd.Series as values if there are any empty cells
    """
    # It checks if any of the values in a specified column are empty. If they are, they are added to the dictionary
    # with the column name as key and a boolean series as value that contain true for every pd.NA
    # If all the columns are filled, then it returns an empty dictionary.
    return {col: df[col].isnull() for col in required_values_columns if df[col].isnull().any()}


def _turn_bool_array_into_index_numbers(series: pd.Series[bool], true_remains: bool = True) -> list[int]:
    """
    This function takes a pd.Series containing boolean values.
    By default, this method extracts the index numbers of the True values.
    If the index numbers of the False values are required, the parameter "true_remains" should be turned to False.

    Args:
        series: pd.Series, which only contains True and False values
        true_remains: True if the index numbers of True are required, likewise with False

    Returns:
        A list of index numbers
    """
    # If the False are required, we need to invert the array.
    if not true_remains:
        series = ~series
    return list(series[series].index)


def get_wrong_row_numbers(
    wrong_row_dict: dict[str, pd.Series[bool]], true_remains: bool = True
) -> dict[str, list[int]]:
    """
    From the boolean pd.Series the index numbers of the True values are extracted.
    The resulting list is the new value of the dictionary.
    This new dictionary is taken and to each index number 2 is added, so that it corresponds to the Excel row number.

    Args:
        wrong_row_dict: The dictionary which contains column names and a boolean pd.Series
        true_remains: If True then the index of True is taken, if False then the index of False values is taken

    Returns:
        Dictionary with the column name as key and the row number as a list.
    """
    wrong_row_int_dict = {
        k: _turn_bool_array_into_index_numbers(series=v, true_remains=true_remains) for k, v in wrong_row_dict.items()
    }
    return {k: [x + 2 for x in v] for k, v in wrong_row_int_dict.items()}


def get_labels(df_row: pd.Series[Any]) -> LanguageDict:
    """
    This function takes a pd.Series which has "label_[language tag]" in the index.
    If the value of the index is not pd.NA, the language tag and the value are added to a dictionary.
    If it is empty, it is omitted from the dictionary.

    Args:
        df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the
            label is extracted

    Returns:
        A dictionary with the language tag and the content of the cell
    """
    labels = {lang: label for lang in languages if not pd.isna(label := df_row[f"label_{lang}"])}
    if not labels:
        labels = {lang: label for lang in languages if not pd.isna(label := df_row[lang])}
    return LanguageDict(labels)


def get_comments(df_row: pd.Series[Any]) -> LanguageDict | None:
    """
    This function takes a pd.Series which has "comment_[language tag]" in the index.
    If the value of the index is not pd.NA, the language tag and the value are added to a dictionary.
    If it is empty, it is omitted from the dictionary.

    Args:
        df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the
            comment is extracted

    Returns:
        A dictionary with the language tag and the content of the cell
    """
    comments = {lang: comment for lang in languages if not pd.isna(comment := df_row[f"comment_{lang}"])}
    return LanguageDict(comments) if comments else None


def find_one_full_cell_in_cols(df: pd.DataFrame, required_columns: list[str]) -> pd.Series[bool] | None:
    """
    This function takes a pd.DataFrame and a list of column names where at least one cell must have a value per row.
    A pd.Series with boolean values is returned, True if any rows do not have a value in at least one column

    Args:
        df: The pd.DataFrame which should be checked
        required_columns: A list of column names where at least one cell per row must have a value

    Returns:
        None if there is no problem or a pd.Series if there is a problem in a row
    """
    # The series has True if the cell is empty
    # In order to combine more than two arrays, we need to reduce the arrays, which takes a tuple
    result_arrays = tuple(df[col].isnull() for col in required_columns)
    # If all are True logical_and returns True otherwise False
    combined_array = np.logical_and.reduce(result_arrays)
    # if any of the values are True, it is turned into a pd.Series
    return pd.Series(combined_array) if any(combined_array) else None


def col_must_or_not_empty_based_on_other_col(
    df: pd.DataFrame,
    substring_list: list[str],
    substring_colname: str,
    check_empty_colname: str,
    must_have_value: bool,
) -> pd.Series[bool] | None:
    """
    It is presumed that the column "substring_colname" has no empty cells.
    Based on the string content of the individual rows, which is specified in the "substring_list",
    the cell in the column "check_empty_colname" is checked whether it is empty or not.
    The "substring_list" contains the different possibilities regarding the content of the cell.
    If the parameter "must_have_value" is True, then the cell in the "check_empty_colname" column must not be empty.
    If the parameter is set to False, then it must be empty.

    Args:
        df: The pd.DataFrame which is checked
        substring_list: A list of possible information that could be in the column "substring_colname"
        substring_colname: The name of the column that may contain any of the sub-strings
        check_empty_colname: The name of the column which is checked if it is empty or not
        must_have_value: True if the "check_empty_colname" should have a value or the reverse.

    Returns:
        None if all rows are correctly filled or empty.
        A series which contains True values for the rows, where it does
        not comply with the specifications.
    """
    na_series = df[check_empty_colname].isna()
    # If the cells have to be empty, we need to reverse the series
    if not must_have_value:
        na_series = ~na_series
    # This returns True if it finds the substring in the cell, they are joined in a RegEx "|" which denotes "or".
    # If it does not match any of the sub-strings, then the RegEx returns False,
    # which means that the value in the column "check_empty_colname" is not relevant.
    substring_array = df[substring_colname].str.contains("|".join(substring_list), na=False, regex=True)
    # If both are True logical_and returns True otherwise False
    combined_array = np.logical_and(na_series, substring_array)
    return pd.Series(combined_array) if any(combined_array) else None


def add_optional_columns(df: pd.DataFrame, optional_col_set: set[str]) -> pd.DataFrame:
    """
    Adds columns to a df if they are not already present.
    The content of the columns is empty.

    Args:
        df: Dataframe
        optional_col_set: set of columns that may be added

    Returns:
        Dataframe with additional columns if they were not present
    """
    in_df_cols = set(df.columns)
    if not optional_col_set.issubset(in_df_cols):
        additional_col = list(optional_col_set.difference(in_df_cols))
        additional_df = pd.DataFrame(columns=additional_col, index=df.index)
        df = pd.concat(objs=[df, additional_df], axis=1)
    return df


def check_permissions(df: pd.DataFrame, allowed_vals: list[str]) -> None | InvalidPermissionsOverruleProblem:
    problems: list[InvalidPermissionsOverrule] = []
    for _, row in df.iterrows():
        if pd.isna(actual_val := row.get("default_permissions_overrule")):
            continue
        if actual_val.strip().lower() not in allowed_vals:
            prob = InvalidPermissionsOverrule(entity_name=row["name"], actual_val=actual_val, allowed_vals=allowed_vals)
            problems.append(prob)
    if problems:
        return InvalidPermissionsOverruleProblem(problems)
    else:
        return None
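The row-number arithmetic in `get_wrong_row_numbers` (index + 2: one for the Excel header row, one for 1-based counting) is easiest to see with a toy input. The sketch below is illustrative only; the DataFrame content is invented for this example.

import pandas as pd

from dsp_tools.commands.excel2json.utils import check_required_values
from dsp_tools.commands.excel2json.utils import get_wrong_row_numbers

# invented sheet content: the "super" cell at index 1 (Excel row 3) is empty
df = pd.DataFrame({"name": ["Image", "Document"], "super": ["Resource", pd.NA]})
missing = check_required_values(df, ["name", "super"])  # {"super": Series([False, True])}
print(get_wrong_row_numbers(missing))  # {'super': [3]}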