dsp-tools 0.9.13__py3-none-any.whl → 18.3.0.post13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsp_tools/__init__.py +5 -0
- dsp_tools/cli/args.py +47 -0
- dsp_tools/cli/call_action.py +85 -0
- dsp_tools/cli/call_action_files_only.py +101 -0
- dsp_tools/cli/call_action_with_network.py +207 -0
- dsp_tools/cli/create_parsers.py +479 -0
- dsp_tools/cli/entry_point.py +322 -0
- dsp_tools/cli/utils.py +87 -0
- dsp_tools/clients/CLAUDE.md +420 -0
- dsp_tools/clients/authentication_client.py +14 -0
- dsp_tools/clients/authentication_client_live.py +66 -0
- dsp_tools/clients/connection.py +35 -0
- dsp_tools/clients/connection_live.py +233 -0
- dsp_tools/clients/fuseki_metrics.py +60 -0
- dsp_tools/clients/group_user_clients.py +35 -0
- dsp_tools/clients/group_user_clients_live.py +181 -0
- dsp_tools/clients/legal_info_client.py +23 -0
- dsp_tools/clients/legal_info_client_live.py +132 -0
- dsp_tools/clients/list_client.py +49 -0
- dsp_tools/clients/list_client_live.py +166 -0
- dsp_tools/clients/metadata_client.py +24 -0
- dsp_tools/clients/metadata_client_live.py +47 -0
- dsp_tools/clients/ontology_clients.py +49 -0
- dsp_tools/clients/ontology_create_client_live.py +166 -0
- dsp_tools/clients/ontology_get_client_live.py +80 -0
- dsp_tools/clients/permissions_client.py +68 -0
- dsp_tools/clients/project_client.py +16 -0
- dsp_tools/clients/project_client_live.py +66 -0
- dsp_tools/commands/create/communicate_problems.py +24 -0
- dsp_tools/commands/create/create.py +134 -0
- dsp_tools/commands/create/create_on_server/cardinalities.py +111 -0
- dsp_tools/commands/create/create_on_server/classes.py +99 -0
- dsp_tools/commands/create/create_on_server/complete_ontologies.py +116 -0
- dsp_tools/commands/create/create_on_server/default_permissions.py +134 -0
- dsp_tools/commands/create/create_on_server/group_users.py +165 -0
- dsp_tools/commands/create/create_on_server/lists.py +163 -0
- dsp_tools/commands/create/create_on_server/mappers.py +12 -0
- dsp_tools/commands/create/create_on_server/onto_utils.py +74 -0
- dsp_tools/commands/create/create_on_server/ontology.py +52 -0
- dsp_tools/commands/create/create_on_server/project.py +68 -0
- dsp_tools/commands/create/create_on_server/properties.py +119 -0
- dsp_tools/commands/create/exceptions.py +29 -0
- dsp_tools/commands/create/lists_only.py +66 -0
- dsp_tools/commands/create/models/create_problems.py +87 -0
- dsp_tools/commands/create/models/parsed_ontology.py +88 -0
- dsp_tools/commands/create/models/parsed_project.py +81 -0
- dsp_tools/commands/create/models/rdf_ontology.py +12 -0
- dsp_tools/commands/create/models/server_project_info.py +100 -0
- dsp_tools/commands/create/parsing/parse_lists.py +45 -0
- dsp_tools/commands/create/parsing/parse_ontology.py +243 -0
- dsp_tools/commands/create/parsing/parse_project.py +149 -0
- dsp_tools/commands/create/parsing/parsing_utils.py +40 -0
- dsp_tools/commands/create/project_validate.py +595 -0
- dsp_tools/commands/create/serialisation/ontology.py +119 -0
- dsp_tools/commands/create/serialisation/project.py +44 -0
- dsp_tools/commands/excel2json/CLAUDE.md +101 -0
- dsp_tools/commands/excel2json/json_header.py +321 -0
- dsp_tools/commands/excel2json/lists/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/compliance_checks.py +292 -0
- dsp_tools/commands/excel2json/lists/make_lists.py +247 -0
- dsp_tools/commands/excel2json/lists/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/lists/models/deserialise.py +30 -0
- dsp_tools/commands/excel2json/lists/models/input_error.py +216 -0
- dsp_tools/commands/excel2json/lists/models/serialise.py +57 -0
- dsp_tools/commands/excel2json/lists/utils.py +81 -0
- dsp_tools/commands/excel2json/models/__init__.py +0 -0
- dsp_tools/commands/excel2json/models/input_error.py +416 -0
- dsp_tools/commands/excel2json/models/json_header.py +175 -0
- dsp_tools/commands/excel2json/models/list_node_name.py +16 -0
- dsp_tools/commands/excel2json/models/ontology.py +76 -0
- dsp_tools/commands/excel2json/old_lists.py +328 -0
- dsp_tools/commands/excel2json/project.py +280 -0
- dsp_tools/commands/excel2json/properties.py +370 -0
- dsp_tools/commands/excel2json/resources.py +336 -0
- dsp_tools/commands/excel2json/utils.py +352 -0
- dsp_tools/commands/excel2xml/__init__.py +7 -0
- dsp_tools/commands/excel2xml/excel2xml_cli.py +523 -0
- dsp_tools/commands/excel2xml/excel2xml_lib.py +1953 -0
- dsp_tools/commands/excel2xml/propertyelement.py +47 -0
- dsp_tools/commands/get/__init__.py +0 -0
- dsp_tools/commands/get/get.py +166 -0
- dsp_tools/commands/get/get_permissions.py +257 -0
- dsp_tools/commands/get/get_permissions_legacy.py +89 -0
- dsp_tools/commands/get/legacy_models/__init__.py +0 -0
- dsp_tools/commands/get/legacy_models/context.py +318 -0
- dsp_tools/commands/get/legacy_models/group.py +241 -0
- dsp_tools/commands/get/legacy_models/helpers.py +47 -0
- dsp_tools/commands/get/legacy_models/listnode.py +390 -0
- dsp_tools/commands/get/legacy_models/model.py +12 -0
- dsp_tools/commands/get/legacy_models/ontology.py +324 -0
- dsp_tools/commands/get/legacy_models/project.py +366 -0
- dsp_tools/commands/get/legacy_models/propertyclass.py +417 -0
- dsp_tools/commands/get/legacy_models/resourceclass.py +676 -0
- dsp_tools/commands/get/legacy_models/user.py +438 -0
- dsp_tools/commands/get/models/__init__.py +0 -0
- dsp_tools/commands/get/models/permissions_models.py +10 -0
- dsp_tools/commands/id2iri.py +258 -0
- dsp_tools/commands/ingest_xmlupload/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/bulk_ingest_client.py +178 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/apply_ingest_id.py +69 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/upload_xml.py +166 -0
- dsp_tools/commands/ingest_xmlupload/create_resources/user_information.py +121 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/ingest_files/ingest_files.py +64 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/__init__.py +0 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/filechecker.py +20 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +57 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_failures.py +66 -0
- dsp_tools/commands/ingest_xmlupload/upload_files/upload_files.py +67 -0
- dsp_tools/commands/resume_xmlupload/__init__.py +0 -0
- dsp_tools/commands/resume_xmlupload/resume_xmlupload.py +96 -0
- dsp_tools/commands/start_stack.py +428 -0
- dsp_tools/commands/update_legal/CLAUDE.md +344 -0
- dsp_tools/commands/update_legal/__init__.py +0 -0
- dsp_tools/commands/update_legal/core.py +182 -0
- dsp_tools/commands/update_legal/csv_operations.py +135 -0
- dsp_tools/commands/update_legal/models.py +87 -0
- dsp_tools/commands/update_legal/xml_operations.py +247 -0
- dsp_tools/commands/validate_data/CLAUDE.md +159 -0
- dsp_tools/commands/validate_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/constants.py +59 -0
- dsp_tools/commands/validate_data/mappers.py +143 -0
- dsp_tools/commands/validate_data/models/__init__.py +0 -0
- dsp_tools/commands/validate_data/models/api_responses.py +45 -0
- dsp_tools/commands/validate_data/models/input_problems.py +119 -0
- dsp_tools/commands/validate_data/models/rdf_like_data.py +117 -0
- dsp_tools/commands/validate_data/models/validation.py +106 -0
- dsp_tools/commands/validate_data/prepare_data/__init__.py +0 -0
- dsp_tools/commands/validate_data/prepare_data/get_rdf_like_data.py +296 -0
- dsp_tools/commands/validate_data/prepare_data/make_data_graph.py +91 -0
- dsp_tools/commands/validate_data/prepare_data/prepare_data.py +184 -0
- dsp_tools/commands/validate_data/process_validation_report/__init__.py +0 -0
- dsp_tools/commands/validate_data/process_validation_report/get_user_validation_message.py +358 -0
- dsp_tools/commands/validate_data/process_validation_report/query_validation_result.py +507 -0
- dsp_tools/commands/validate_data/process_validation_report/reformat_validation_results.py +150 -0
- dsp_tools/commands/validate_data/shacl_cli_validator.py +70 -0
- dsp_tools/commands/validate_data/sparql/__init__.py +0 -0
- dsp_tools/commands/validate_data/sparql/cardinality_shacl.py +209 -0
- dsp_tools/commands/validate_data/sparql/construct_shacl.py +92 -0
- dsp_tools/commands/validate_data/sparql/legal_info_shacl.py +36 -0
- dsp_tools/commands/validate_data/sparql/value_shacl.py +357 -0
- dsp_tools/commands/validate_data/utils.py +59 -0
- dsp_tools/commands/validate_data/validate_data.py +283 -0
- dsp_tools/commands/validate_data/validation/__init__.py +0 -0
- dsp_tools/commands/validate_data/validation/check_duplicate_files.py +55 -0
- dsp_tools/commands/validate_data/validation/check_for_unknown_classes.py +67 -0
- dsp_tools/commands/validate_data/validation/get_validation_report.py +94 -0
- dsp_tools/commands/validate_data/validation/validate_ontology.py +107 -0
- dsp_tools/commands/xmlupload/CLAUDE.md +292 -0
- dsp_tools/commands/xmlupload/__init__.py +0 -0
- dsp_tools/commands/xmlupload/iri_resolver.py +21 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/__init__.py +0 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/constants.py +63 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/jsonld_utils.py +44 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_file_value.py +77 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_resource_and_values.py +114 -0
- dsp_tools/commands/xmlupload/make_rdf_graph/make_values.py +262 -0
- dsp_tools/commands/xmlupload/models/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/bitstream_info.py +18 -0
- dsp_tools/commands/xmlupload/models/formatted_text_value.py +10 -0
- dsp_tools/commands/xmlupload/models/ingest.py +143 -0
- dsp_tools/commands/xmlupload/models/input_problems.py +58 -0
- dsp_tools/commands/xmlupload/models/lookup_models.py +21 -0
- dsp_tools/commands/xmlupload/models/permission.py +45 -0
- dsp_tools/commands/xmlupload/models/permissions_parsed.py +93 -0
- dsp_tools/commands/xmlupload/models/processed/__init__.py +0 -0
- dsp_tools/commands/xmlupload/models/processed/file_values.py +29 -0
- dsp_tools/commands/xmlupload/models/processed/res.py +27 -0
- dsp_tools/commands/xmlupload/models/processed/values.py +101 -0
- dsp_tools/commands/xmlupload/models/rdf_models.py +26 -0
- dsp_tools/commands/xmlupload/models/upload_clients.py +14 -0
- dsp_tools/commands/xmlupload/models/upload_state.py +20 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/__init__.py +0 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/ark2iri.py +55 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/get_processed_resources.py +252 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/iiif_uri_validator.py +50 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/list_client.py +120 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/prepare_xml_input.py +67 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/read_validate_xml_file.py +58 -0
- dsp_tools/commands/xmlupload/prepare_xml_input/transform_input_values.py +118 -0
- dsp_tools/commands/xmlupload/resource_create_client.py +25 -0
- dsp_tools/commands/xmlupload/richtext_id2iri.py +37 -0
- dsp_tools/commands/xmlupload/stash/__init__.py +0 -0
- dsp_tools/commands/xmlupload/stash/analyse_circular_reference_graph.py +236 -0
- dsp_tools/commands/xmlupload/stash/create_info_for_graph.py +53 -0
- dsp_tools/commands/xmlupload/stash/graph_models.py +87 -0
- dsp_tools/commands/xmlupload/stash/stash_circular_references.py +68 -0
- dsp_tools/commands/xmlupload/stash/stash_models.py +109 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_resptr_props.py +106 -0
- dsp_tools/commands/xmlupload/stash/upload_stashed_xml_texts.py +196 -0
- dsp_tools/commands/xmlupload/upload_config.py +76 -0
- dsp_tools/commands/xmlupload/write_diagnostic_info.py +27 -0
- dsp_tools/commands/xmlupload/xmlupload.py +516 -0
- dsp_tools/config/__init__.py +0 -0
- dsp_tools/config/logger_config.py +69 -0
- dsp_tools/config/warnings_config.py +32 -0
- dsp_tools/error/__init__.py +0 -0
- dsp_tools/error/custom_warnings.py +39 -0
- dsp_tools/error/exceptions.py +204 -0
- dsp_tools/error/problems.py +10 -0
- dsp_tools/error/xmllib_errors.py +20 -0
- dsp_tools/error/xmllib_warnings.py +54 -0
- dsp_tools/error/xmllib_warnings_util.py +159 -0
- dsp_tools/error/xsd_validation_error_msg.py +19 -0
- dsp_tools/legacy_models/__init__.py +0 -0
- dsp_tools/legacy_models/datetimestamp.py +81 -0
- dsp_tools/legacy_models/langstring.py +253 -0
- dsp_tools/legacy_models/projectContext.py +49 -0
- dsp_tools/py.typed +0 -0
- dsp_tools/resources/schema/data.xsd +648 -0
- dsp_tools/resources/schema/lists-only.json +72 -0
- dsp_tools/resources/schema/project.json +1258 -0
- dsp_tools/resources/schema/properties-only.json +874 -0
- dsp_tools/resources/schema/resources-only.json +140 -0
- dsp_tools/resources/start-stack/docker-compose.override-host.j2 +11 -0
- dsp_tools/resources/start-stack/docker-compose.override.yml +11 -0
- dsp_tools/resources/start-stack/docker-compose.yml +88 -0
- dsp_tools/resources/start-stack/dsp-app-config.json +45 -0
- dsp_tools/resources/start-stack/dsp-app-config.override-host.j2 +26 -0
- dsp_tools/resources/validate_data/api-shapes-resource-cardinalities.ttl +191 -0
- dsp_tools/resources/validate_data/api-shapes.ttl +804 -0
- dsp_tools/resources/validate_data/shacl-cli-image.yml +4 -0
- dsp_tools/resources/validate_data/validate-ontology.ttl +99 -0
- dsp_tools/utils/__init__.py +0 -0
- dsp_tools/utils/ansi_colors.py +32 -0
- dsp_tools/utils/data_formats/__init__.py +0 -0
- dsp_tools/utils/data_formats/date_util.py +166 -0
- dsp_tools/utils/data_formats/iri_util.py +30 -0
- dsp_tools/utils/data_formats/shared.py +81 -0
- dsp_tools/utils/data_formats/uri_util.py +76 -0
- dsp_tools/utils/fuseki_bloating.py +63 -0
- dsp_tools/utils/json_parsing.py +22 -0
- dsp_tools/utils/rdf_constants.py +42 -0
- dsp_tools/utils/rdflib_utils.py +10 -0
- dsp_tools/utils/replace_id_with_iri.py +66 -0
- dsp_tools/utils/request_utils.py +238 -0
- dsp_tools/utils/xml_parsing/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/get_lookups.py +32 -0
- dsp_tools/utils/xml_parsing/get_parsed_resources.py +325 -0
- dsp_tools/utils/xml_parsing/models/__init__.py +0 -0
- dsp_tools/utils/xml_parsing/models/parsed_resource.py +76 -0
- dsp_tools/utils/xml_parsing/parse_clean_validate_xml.py +137 -0
- dsp_tools/xmllib/CLAUDE.md +302 -0
- dsp_tools/xmllib/__init__.py +49 -0
- dsp_tools/xmllib/general_functions.py +877 -0
- dsp_tools/xmllib/internal/__init__.py +0 -0
- dsp_tools/xmllib/internal/checkers.py +162 -0
- dsp_tools/xmllib/internal/circumvent_circular_imports.py +36 -0
- dsp_tools/xmllib/internal/constants.py +46 -0
- dsp_tools/xmllib/internal/input_converters.py +155 -0
- dsp_tools/xmllib/internal/serialise_file_value.py +57 -0
- dsp_tools/xmllib/internal/serialise_resource.py +177 -0
- dsp_tools/xmllib/internal/serialise_values.py +152 -0
- dsp_tools/xmllib/internal/type_aliases.py +11 -0
- dsp_tools/xmllib/models/__init__.py +0 -0
- dsp_tools/xmllib/models/config_options.py +28 -0
- dsp_tools/xmllib/models/date_formats.py +48 -0
- dsp_tools/xmllib/models/dsp_base_resources.py +1542 -0
- dsp_tools/xmllib/models/internal/__init__.py +0 -0
- dsp_tools/xmllib/models/internal/file_values.py +172 -0
- dsp_tools/xmllib/models/internal/geometry.py +162 -0
- dsp_tools/xmllib/models/internal/migration_metadata.py +55 -0
- dsp_tools/xmllib/models/internal/serialise_permissions.py +66 -0
- dsp_tools/xmllib/models/internal/values.py +342 -0
- dsp_tools/xmllib/models/licenses/__init__.py +0 -0
- dsp_tools/xmllib/models/licenses/other.py +59 -0
- dsp_tools/xmllib/models/licenses/recommended.py +107 -0
- dsp_tools/xmllib/models/permissions.py +41 -0
- dsp_tools/xmllib/models/res.py +1782 -0
- dsp_tools/xmllib/models/root.py +348 -0
- dsp_tools/xmllib/value_checkers.py +434 -0
- dsp_tools/xmllib/value_converters.py +777 -0
- dsp_tools-18.3.0.post13.dist-info/METADATA +90 -0
- dsp_tools-18.3.0.post13.dist-info/RECORD +286 -0
- dsp_tools-18.3.0.post13.dist-info/WHEEL +4 -0
- dsp_tools-18.3.0.post13.dist-info/entry_points.txt +3 -0
- dsp_tools-0.9.13.dist-info/LICENSE +0 -674
- dsp_tools-0.9.13.dist-info/METADATA +0 -144
- dsp_tools-0.9.13.dist-info/RECORD +0 -71
- dsp_tools-0.9.13.dist-info/WHEEL +0 -5
- dsp_tools-0.9.13.dist-info/entry_points.txt +0 -3
- dsp_tools-0.9.13.dist-info/top_level.txt +0 -1
- dsplib/models/connection.py +0 -272
- dsplib/models/group.py +0 -296
- dsplib/models/helpers.py +0 -505
- dsplib/models/langstring.py +0 -277
- dsplib/models/listnode.py +0 -578
- dsplib/models/model.py +0 -20
- dsplib/models/ontology.py +0 -448
- dsplib/models/permission.py +0 -112
- dsplib/models/project.py +0 -547
- dsplib/models/propertyclass.py +0 -505
- dsplib/models/resource.py +0 -366
- dsplib/models/resourceclass.py +0 -810
- dsplib/models/sipi.py +0 -30
- dsplib/models/user.py +0 -731
- dsplib/models/value.py +0 -1000
- dsplib/utils/knora-data-schema.xsd +0 -454
- dsplib/utils/knora-schema-lists.json +0 -83
- dsplib/utils/knora-schema.json +0 -434
- dsplib/utils/onto_commons.py +0 -24
- dsplib/utils/onto_create_lists.py +0 -73
- dsplib/utils/onto_create_ontology.py +0 -442
- dsplib/utils/onto_get.py +0 -58
- dsplib/utils/onto_validate.py +0 -33
- dsplib/utils/xml_upload.py +0 -539
- dsplib/widgets/doublepassword.py +0 -80
- knora/MLS-import-libraries.py +0 -84
- knora/dsp_tools.py +0 -96
- knora/dsplib/models/connection.py +0 -272
- knora/dsplib/models/group.py +0 -296
- knora/dsplib/models/helpers.py +0 -506
- knora/dsplib/models/langstring.py +0 -277
- knora/dsplib/models/listnode.py +0 -578
- knora/dsplib/models/model.py +0 -20
- knora/dsplib/models/ontology.py +0 -448
- knora/dsplib/models/permission.py +0 -112
- knora/dsplib/models/project.py +0 -583
- knora/dsplib/models/propertyclass.py +0 -505
- knora/dsplib/models/resource.py +0 -416
- knora/dsplib/models/resourceclass.py +0 -811
- knora/dsplib/models/sipi.py +0 -35
- knora/dsplib/models/user.py +0 -731
- knora/dsplib/models/value.py +0 -1000
- knora/dsplib/utils/knora-data-schema.xsd +0 -464
- knora/dsplib/utils/knora-schema-lists.json +0 -83
- knora/dsplib/utils/knora-schema.json +0 -444
- knora/dsplib/utils/onto_commons.py +0 -24
- knora/dsplib/utils/onto_create_lists.py +0 -73
- knora/dsplib/utils/onto_create_ontology.py +0 -451
- knora/dsplib/utils/onto_get.py +0 -58
- knora/dsplib/utils/onto_validate.py +0 -33
- knora/dsplib/utils/xml_upload.py +0 -540
- knora/dsplib/widgets/doublepassword.py +0 -80
- knora/knora.py +0 -2108
- knora/test.py +0 -99
- knora/testit.py +0 -76
- knora/xml2knora.py +0 -633
- {dsplib → dsp_tools/cli}/__init__.py +0 -0
- {dsplib/models → dsp_tools/clients}/__init__.py +0 -0
- {dsplib/utils → dsp_tools/commands}/__init__.py +0 -0
- {dsplib/widgets → dsp_tools/commands/create}/__init__.py +0 -0
- {knora → dsp_tools/commands/create/create_on_server}/__init__.py +0 -0
- {knora/dsplib → dsp_tools/commands/create/models}/__init__.py +0 -0
- {knora/dsplib/models → dsp_tools/commands/create/parsing}/__init__.py +0 -0
- {knora/dsplib/utils → dsp_tools/commands/create/serialisation}/__init__.py +0 -0
- {knora/dsplib/widgets → dsp_tools/commands/excel2json}/__init__.py +0 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Any
|
|
6
|
+
from typing import cast
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import regex
|
|
10
|
+
from loguru import logger
|
|
11
|
+
|
|
12
|
+
from dsp_tools.commands.excel2json.lists.models.deserialise import Columns
|
|
13
|
+
from dsp_tools.commands.excel2json.lists.models.deserialise import ExcelSheet
|
|
14
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import CollectedSheetProblems
|
|
15
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import DuplicateIDProblem
|
|
16
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import DuplicatesCustomIDInProblem
|
|
17
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import DuplicatesInSheetProblem
|
|
18
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import DuplicatesListNameProblem
|
|
19
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import ListCreationProblem
|
|
20
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import ListInformation
|
|
21
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import ListSheetComplianceProblem
|
|
22
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import ListSheetContentProblem
|
|
23
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import MinimumRowsProblem
|
|
24
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import MissingExpectedColumn
|
|
25
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import MissingNodeColumn
|
|
26
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import MissingNodeTranslationProblem
|
|
27
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import MissingTranslationsSheetProblem
|
|
28
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import MultipleListPerSheetProblem
|
|
29
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import NodesPerRowProblem
|
|
30
|
+
from dsp_tools.commands.excel2json.lists.models.input_error import SheetProblem
|
|
31
|
+
from dsp_tools.commands.excel2json.lists.utils import get_columns_of_preferred_lang
|
|
32
|
+
from dsp_tools.commands.excel2json.lists.utils import get_hierarchy_nums
|
|
33
|
+
from dsp_tools.commands.excel2json.lists.utils import get_lang_string_from_column_name
|
|
34
|
+
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
|
|
35
|
+
from dsp_tools.error.custom_warnings import DspToolsUserWarning
|
|
36
|
+
from dsp_tools.error.exceptions import InputError
|
|
37
|
+
from dsp_tools.error.problems import Problem
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def make_all_excel_compliance_checks(sheet_list: list[ExcelSheet]) -> None:
    """Run every compliance check on the parsed Excel sheets, raising an InputError on the first failing stage."""
    # The order is significant: each later check assumes that all earlier ones passed.
    ordered_checks = (
        _check_duplicates_all_excels,
        _make_shape_compliance_all_excels,
        _check_for_missing_translations_all_excels,
        _check_for_unique_list_names,
        _check_for_erroneous_entries_all_excels,
    )
    for check in ordered_checks:
        check(sheet_list)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _check_duplicates_all_excels(sheet_list: list[ExcelSheet]) -> None:
    """
    Verify that node names are unique within each sheet and that custom IDs are unique across all excel files.

    A node-name duplicate is a row whose entries in all node-name columns match those of another row.

    Args:
        sheet_list: class instances representing an excel file with sheets

    Raises:
        InputError: If any complete duplicates are found in the excel files.
    """
    collected: list[Problem] = []
    per_sheet: list[SheetProblem] = []
    for sheet in sheet_list:
        if (found := _check_for_duplicate_nodes_one_df(sheet)) is not None:
            per_sheet.append(found)
    if per_sheet:
        collected.append(CollectedSheetProblems(per_sheet))
    id_problem = _check_for_duplicate_custom_id_all_excels(sheet_list)
    if id_problem:
        collected.append(id_problem)
    if not collected:
        return
    msg = ListCreationProblem(collected).execute_error_protocol()
    logger.error(msg)
    raise InputError(msg)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _check_for_unique_list_names(sheet_list: list[ExcelSheet]) -> None:
    """Ensure that every sheet holds exactly one list and that no list name occurs more than once overall."""
    collected_names: list[ListInformation] = []
    per_sheet_problems: list[SheetProblem] = []
    for sheet in sheet_list:
        names_in_sheet = list(sheet.df[f"{sheet.col_info.preferred_lang}_list"].unique())
        if len(names_in_sheet) != 1:
            per_sheet_problems.append(
                MultipleListPerSheetProblem(sheet.excel_name, sheet.sheet_name, names_in_sheet)
            )
        collected_names.extend(
            ListInformation(sheet.excel_name, sheet.sheet_name, name) for name in names_in_sheet
        )
    all_problems: list[Problem] = []
    if per_sheet_problems:
        all_problems.append(CollectedSheetProblems(per_sheet_problems))
    # Group every occurrence by list name; any group with more than one entry is a duplicate.
    grouped: dict[str, list[ListInformation]] = defaultdict(list)
    for entry in collected_names:
        grouped[entry.list_name].append(entry)
    duplicates = [entry for group in grouped.values() if len(group) > 1 for entry in group]
    if duplicates:
        all_problems.append(DuplicatesListNameProblem(duplicates))
    if all_problems:
        msg = ListCreationProblem(all_problems).execute_error_protocol()
        logger.error(msg)
        raise InputError(msg)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _check_for_duplicate_nodes_one_df(sheet: ExcelSheet) -> DuplicatesInSheetProblem | None:
|
|
105
|
+
"""Check if any rows have duplicates when taking into account the columns with the node names."""
|
|
106
|
+
lang_columns = [col for col in sheet.df.columns if regex.search(r"^(en|de|fr|it|rm)_(\d+|list)$", col)]
|
|
107
|
+
if (duplicate_filter := sheet.df.duplicated(lang_columns, keep=False)).any():
|
|
108
|
+
return DuplicatesInSheetProblem(
|
|
109
|
+
sheet.excel_name, sheet.sheet_name, duplicate_filter.index[duplicate_filter].tolist()
|
|
110
|
+
)
|
|
111
|
+
return None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _check_for_duplicate_custom_id_all_excels(sheet_list: list[ExcelSheet]) -> DuplicatesCustomIDInProblem | None:
|
|
115
|
+
id_list = []
|
|
116
|
+
for sheet in sheet_list:
|
|
117
|
+
for i, row in sheet.df.iterrows():
|
|
118
|
+
if not pd.isna(row["id (optional)"]):
|
|
119
|
+
id_list.append(
|
|
120
|
+
{
|
|
121
|
+
"filename": sheet.excel_name,
|
|
122
|
+
"sheet_name": sheet.sheet_name,
|
|
123
|
+
"id": row["id (optional)"],
|
|
124
|
+
"row_number": int(str(i)) + 2,
|
|
125
|
+
}
|
|
126
|
+
)
|
|
127
|
+
id_df = pd.DataFrame.from_records(id_list)
|
|
128
|
+
if (duplicate_ids := id_df.duplicated("id", keep=False)).any():
|
|
129
|
+
problems: dict[str, DuplicateIDProblem] = defaultdict(lambda: DuplicateIDProblem())
|
|
130
|
+
for i, row in id_df[duplicate_ids].iterrows():
|
|
131
|
+
problems[row["id"]].custom_id = row["id"]
|
|
132
|
+
problems[row["id"]].excel_locations.append(
|
|
133
|
+
PositionInExcel(sheet=row["sheet_name"], excel_filename=row["filename"], row=row["row_number"])
|
|
134
|
+
)
|
|
135
|
+
final_problems = list(problems.values())
|
|
136
|
+
return DuplicatesCustomIDInProblem(final_problems)
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _make_shape_compliance_all_excels(sheet_list: list[ExcelSheet]) -> None:
    """Validate the overall shape of every sheet and raise an InputError if any sheet is malformed."""
    found: list[SheetProblem] = []
    for sheet in sheet_list:
        if (problem := _make_shape_compliance_one_sheet(sheet)) is not None:
            found.append(problem)
    if not found:
        return
    msg = ListCreationProblem([CollectedSheetProblems(found)]).execute_error_protocol()
    logger.error(msg)
    raise InputError(msg)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _make_shape_compliance_one_sheet(sheet: ExcelSheet) -> ListSheetComplianceProblem | None:
    """Check one sheet for minimum row count, node columns and complete translation columns."""
    found: list[Problem] = []
    if len(sheet.df) < 2:
        found.append(MinimumRowsProblem())
    if not sheet.col_info.node_cols:
        found.append(MissingNodeColumn())
    missing_cols = _check_if_all_translations_in_all_column_levels_present_one_sheet(sheet.df.columns)
    if missing_cols:
        found.append(missing_cols)
    # Unexpected columns only trigger a warning, never a hard failure.
    _check_warn_unusual_columns_one_sheet(sheet.df.columns)
    return ListSheetComplianceProblem(sheet.excel_name, sheet.sheet_name, found) if found else None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _check_warn_unusual_columns_one_sheet(cols: pd.Index[str]) -> None:
|
|
166
|
+
not_matched = [x for x in cols if not regex.search(r"^(en|de|fr|it|rm)_(\d+|list|comments)|(id \(optional\))$", x)]
|
|
167
|
+
if not_matched:
|
|
168
|
+
msg = (
|
|
169
|
+
f"The following columns do not conform to the expected format "
|
|
170
|
+
f"and will not be included in the output: {', '.join(not_matched)}"
|
|
171
|
+
)
|
|
172
|
+
warnings.warn(DspToolsUserWarning(msg))
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _check_if_all_translations_in_all_column_levels_present_one_sheet(
    cols: pd.Index[str],
) -> MissingExpectedColumn | None:
    """Ensure that every hierarchy level (1, 2, ... and "list") exists for every language found in the columns."""
    found_languages = {lang for c in cols if (lang := get_lang_string_from_column_name(c)) is not None}
    level_names = [str(num) for num in get_hierarchy_nums(cols)] + ["list"]
    # Every detected language must provide a column for every level.
    required = {f"{lang}_{level}" for lang in found_languages for level in level_names}
    absent = required - set(cols)
    return MissingExpectedColumn(absent) if absent else None
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _check_for_missing_translations_all_excels(sheet_list: list[ExcelSheet]) -> None:
    """Raise an InputError if any sheet contains nodes with incomplete translations."""
    found: list[SheetProblem] = []
    for sheet in sheet_list:
        if (problem := _check_for_missing_translations_one_sheet(sheet)) is not None:
            found.append(problem)
    if not found:
        return
    msg = ListCreationProblem([CollectedSheetProblems(found)]).execute_error_protocol()
    logger.error(msg)
    raise InputError(msg)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _check_for_missing_translations_one_sheet(sheet: ExcelSheet) -> MissingTranslationsSheetProblem | None:
    """Collect per-row translation problems for one sheet, or return None if every row is complete."""
    row_problems = [
        problem
        for idx, row in sheet.df.iterrows()
        if (problem := _check_missing_translations_one_row(int(str(idx)), row, sheet.col_info))
    ]
    if not row_problems:
        return None
    return MissingTranslationsSheetProblem(sheet.excel_name, sheet.sheet_name, row_problems)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _check_missing_translations_one_row(
    row_index: int, row: pd.Series[Any], columns: Columns
) -> MissingNodeTranslationProblem | None:
    """Check one row for partially translated column groups (node, list and comment columns)."""
    # The groups are checked in this fixed order: node levels first, then list, then comments.
    column_groups = [group.columns for group in columns.node_cols]
    column_groups.append(columns.list_cols)
    if columns.comment_cols:
        column_groups.append(columns.comment_cols)
    empty: list[str] = []
    for group in column_groups:
        empty.extend(_check_for_missing_translations_one_column_group(row, group))
    if not empty:
        return None
    return MissingNodeTranslationProblem(empty_columns=empty, index_num=row_index)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _check_for_missing_translations_one_column_group(row: pd.Series[Any], columns: list[str]) -> list[str]:
|
|
229
|
+
missing = row[columns].isna()
|
|
230
|
+
if missing.any() and not missing.all():
|
|
231
|
+
return [str(index) for index, is_missing in missing.items() if is_missing]
|
|
232
|
+
return []
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _check_for_erroneous_entries_all_excels(sheet_list: list[ExcelSheet]) -> None:
    """Check the node structure of all sheets and raise an InputError if any sheet is malformed."""
    collected: list[SheetProblem] = []
    for sheet in sheet_list:
        if (problem := _check_for_erroneous_entries_one_list(sheet)) is not None:
            collected.append(problem)
    if collected:
        msg = ListCreationProblem([CollectedSheetProblems(collected)]).execute_error_protocol()
        logger.error(msg)
        raise InputError(msg)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _check_for_erroneous_entries_one_list(sheet: ExcelSheet) -> ListSheetContentProblem | None:
    """Validate the node hierarchy of one sheet, using the columns of the preferred language."""
    lang = sheet.col_info.preferred_lang
    # the list column comes first, followed by the numbered level columns in ascending order
    check_cols = [f"{lang}_list", *sorted(get_columns_of_preferred_lang(sheet.df.columns, lang, r"\d+"))]
    node_problems = _check_for_erroneous_node_info_one_df(sheet.df, check_cols)
    if not node_problems:
        return None
    return ListSheetContentProblem(sheet.excel_name, sheet.sheet_name, cast(list[Problem], node_problems))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _check_for_erroneous_node_info_one_df(df: pd.DataFrame, columns: list[str]) -> list[NodesPerRowProblem]:
    """Check every hierarchy level (column) of one sheet for rows that violate the node structure.

    Args:
        df: content of one sheet
        columns: hierarchy columns of the preferred language, from the list column down to the deepest level

    Returns:
        one problem per offending row/column combination
    """
    problems: list[NodesPerRowProblem] = []
    # only the index is needed; the previous `enumerate` bound an unused column-name variable
    for focus_col_index in range(len(columns)):
        problems.extend(_check_for_erroneous_entries_one_column_level(df, columns, focus_col_index))
    return problems
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _check_for_erroneous_entries_one_column_level(
    df: pd.DataFrame, columns: list[str], focus_col_index: int
) -> list[NodesPerRowProblem]:
    """Check one hierarchical level of the node columns (e.g. "en_1") for structural problems.

    We need to group by the current column together with all its ancestor columns,
    otherwise identical values in that column may be interpreted as belonging to the same group.

    Args:
        df: content of one sheet
        columns: hierarchy columns of the preferred language
        focus_col_index: index (into `columns`) of the level that is currently checked

    Returns:
        problems found in any of the groups of this level
    """
    grouped = df.groupby(columns[: focus_col_index + 1])
    # loop-invariant: the columns still to check depend only on the focus column, so compute them once
    remaining_to_check_columns = columns[focus_col_index:]
    problems: list[NodesPerRowProblem] = []
    for _, group in grouped:  # the group key itself is not needed
        problems.extend(_check_for_erroneous_entries_one_grouped_df(group, remaining_to_check_columns))
    return problems
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _check_for_erroneous_entries_one_grouped_df(
    group: pd.DataFrame, target_cols: list[str]
) -> list[NodesPerRowProblem]:
    """Check one group of rows that share the same ancestor node.

    The first row of the group is the parent node; all following rows are its children.

    Args:
        group: rows belonging to one parent node
        target_cols: the hierarchy columns still to validate; `target_cols[0]` is the parent's own column

    Returns:
        one problem per rule violation found in the group
    """
    problems: list[NodesPerRowProblem] = []
    # index label of the parent row (NOTE(review): despite the name, this is a row index, not a column)
    first_col = min(group.index)
    # The first row is the current parent node. The remaining columns in that row must be empty.
    if not group.loc[first_col, target_cols[1:]].isna().all():
        problems.append(NodesPerRowProblem(target_cols[1:], int(first_col), should_be_empty=True))
    # with a single target column there are no child columns left to validate
    if not len(target_cols) > 1:
        return problems
    # The second column of the remaining rows must not be empty, as these are the child nodes of the first row.
    remaining_rows_of_next_column = group.loc[group.index[1:], target_cols[1]]
    if (missing := remaining_rows_of_next_column.isna()).any():
        # select only the offending rows; just their index is used, the row content is ignored
        for i, row in group[1:][missing].iterrows():
            problems.append(NodesPerRowProblem([target_cols[1]], int(str(i)), should_be_empty=False))
    return problems
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections import Counter
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import regex
|
|
11
|
+
|
|
12
|
+
from dsp_tools.commands.excel2json.lists.compliance_checks import make_all_excel_compliance_checks
|
|
13
|
+
from dsp_tools.commands.excel2json.lists.models.deserialise import Columns
|
|
14
|
+
from dsp_tools.commands.excel2json.lists.models.deserialise import ExcelSheet
|
|
15
|
+
from dsp_tools.commands.excel2json.lists.models.serialise import ListNode
|
|
16
|
+
from dsp_tools.commands.excel2json.lists.models.serialise import ListRoot
|
|
17
|
+
from dsp_tools.commands.excel2json.lists.utils import get_column_info
|
|
18
|
+
from dsp_tools.commands.excel2json.lists.utils import get_columns_of_preferred_lang
|
|
19
|
+
from dsp_tools.commands.excel2json.lists.utils import get_lang_string_from_column_name
|
|
20
|
+
from dsp_tools.commands.excel2json.old_lists import validate_lists_section_with_schema
|
|
21
|
+
from dsp_tools.commands.excel2json.utils import add_optional_columns
|
|
22
|
+
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def excel2lists(
    excelfolder: str | Path,
    path_to_output_file: Path | None = None,  # `Optional` replaced by `| None` for consistency with the rest of the file
) -> tuple[list[dict[str, Any]], bool]:
    """
    Convert lists described in Excel files into a "lists" section that can be inserted into a JSON project file.
    If path_to_output_file is not None, write the result into the output file.

    Args:
        excelfolder: path to the folder containing the Excel file(s)
        path_to_output_file: path to the file where the output JSON file will be saved

    Raises:
        InputError: if there is a problem with the input data

    Returns:
        a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
    """
    df_dict = _parse_files(excelfolder)
    sheet_list = _prepare_sheets(df_dict)

    finished_lists = _make_serialised_lists(sheet_list)
    # fails with an InputError if the constructed section does not conform to the JSON schema
    validate_lists_section_with_schema(lists_section=finished_lists)

    if path_to_output_file:
        with open(path_to_output_file, "w", encoding="utf-8") as fp:
            json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
        print(f"lists section was created successfully and written to file '{path_to_output_file}'")

    return finished_lists, True
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _parse_files(excelfolder: Path | str) -> dict[str, dict[str, pd.DataFrame]]:
    """Read every non-hidden Excel file whose name contains "list"; map file path -> sheets."""
    candidates = Path(excelfolder).glob("*list*.xlsx", case_sensitive=False)
    return {str(file): read_and_clean_all_sheets(file) for file in candidates if _non_hidden(file)}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _prepare_sheets(df_dict: dict[str, dict[str, pd.DataFrame]]) -> list[ExcelSheet]:
    """Wrap every raw DataFrame into an ExcelSheet, run the compliance checks, and construct the node IDs."""
    all_sheets: list[ExcelSheet] = [
        _prepare_one_sheet(df, file, sheet_name)
        for file, sheets in df_dict.items()
        for sheet_name, df in sheets.items()
    ]
    make_all_excel_compliance_checks(all_sheets)
    return _construct_ids(all_sheets)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _prepare_one_sheet(df: pd.DataFrame, filename: str, sheet_name: str) -> ExcelSheet:
    """Attach the parsed column metadata and make sure the optional ID column exists."""
    col_info = get_column_info(df.columns)
    df_with_id_col = add_optional_columns(df, {"id (optional)"})
    return ExcelSheet(excel_name=filename, sheet_name=sheet_name, col_info=col_info, df=df_with_id_col)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _non_hidden(path: Path) -> bool:
    """True unless the file is hidden (".foo") or an Office lock file ("~$foo")."""
    return regex.search(r"^(\.|~\$).+", path.name) is None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _construct_ids(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
    """Fill in the ID column of every sheet, de-duplicate the IDs across files, and add the parent-ID column."""
    sheets_with_ids = []
    for sheet in sheet_list:
        completed_df = _complete_id_one_df(sheet.df, sheet.col_info.preferred_lang)
        new_sheet = ExcelSheet(
            excel_name=sheet.excel_name, col_info=sheet.col_info, sheet_name=sheet.sheet_name, df=completed_df
        )
        sheets_with_ids.append(new_sheet)
    deduplicated = _resolve_duplicate_ids_all_excels(sheets_with_ids)
    return _fill_parent_id_col_all_excels(deduplicated)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _fill_parent_id_col_all_excels(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
    """Add the "parent_id" column to the DataFrame of every sheet."""
    return [
        ExcelSheet(
            excel_name=sheet.excel_name,
            sheet_name=sheet.sheet_name,
            col_info=sheet.col_info,
            df=_fill_parent_id_col_one_df(sheet.df, sheet.col_info.preferred_lang),
        )
        for sheet in sheet_list
    ]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _fill_parent_id_col_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
    """Create an extra column with the ID of the parent node.

    Args:
        df: sheet content with a completed "id" column (modified in place)
        preferred_language: language whose columns define the hierarchy

    Returns:
        the same DataFrame with a filled "parent_id" column
    """
    # To start, all rows get the ID of the list. These will be overwritten if the row has another parent.
    df["parent_id"] = df.at[0, "id"]
    columns = get_columns_of_preferred_lang(df.columns, preferred_language, r"\d+")
    for num in range(len(columns)):
        # group by each hierarchy prefix: within a group, the first row is the parent of the rest
        grouped = df.groupby(columns[: num + 1])
        for _, group in grouped:  # the group key was an unused variable; only the rows are needed
            if group.shape[0] > 1:
                # The first row already has the correct ID assigned
                rest_index = list(group.index)[1:]
                df.loc[rest_index, "parent_id"] = group.at[group.index[0], "id"]
    return df
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _resolve_duplicate_ids_all_excels(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
|
|
120
|
+
ids = []
|
|
121
|
+
for sheet in sheet_list:
|
|
122
|
+
ids.extend(sheet.df["id"].tolist())
|
|
123
|
+
counter = Counter(ids)
|
|
124
|
+
if duplicate_ids := [item for item, count in counter.items() if count > 1]:
|
|
125
|
+
return _remove_duplicate_ids_in_all_excels(duplicate_ids, sheet_list)
|
|
126
|
+
return sheet_list
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _remove_duplicate_ids_in_all_excels(duplicate_ids: list[str], sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
|
|
130
|
+
all_sheets = []
|
|
131
|
+
for sheet in sheet_list:
|
|
132
|
+
df = sheet.df
|
|
133
|
+
for i, row in df.iterrows():
|
|
134
|
+
if row["id"] in duplicate_ids and pd.isna(row["id (optional)"]):
|
|
135
|
+
df.loc[i, "id"] = _construct_non_duplicate_id_string(row, sheet.col_info.preferred_lang) # type: ignore[index]
|
|
136
|
+
all_sheets.append(
|
|
137
|
+
ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, col_info=sheet.col_info, df=df)
|
|
138
|
+
)
|
|
139
|
+
return sheet_list
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _complete_id_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
    """Fill the "id" column: custom IDs win, missing ones get an auto-ID, and duplicates are resolved."""
    with_auto_ids = _create_auto_id_one_df(df, preferred_language)
    with_auto_ids["id"] = with_auto_ids["id (optional)"].fillna(with_auto_ids["auto_id"])
    return _resolve_duplicate_ids_keep_custom_change_auto_id_one_df(with_auto_ids, preferred_language)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _resolve_duplicate_ids_keep_custom_change_auto_id_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
|
|
150
|
+
"""If there are duplicates in the id column, the auto_id is changed, the custom ID remains the same."""
|
|
151
|
+
if (duplicate_filter := df["id"].duplicated(keep=False)).any():
|
|
152
|
+
for i in duplicate_filter.index[duplicate_filter]:
|
|
153
|
+
if pd.isna(df.loc[i, "id (optional)"]):
|
|
154
|
+
df.loc[i, "id"] = _construct_non_duplicate_id_string(df.loc[i], preferred_language)
|
|
155
|
+
return df
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _create_auto_id_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
    """For every node without manual ID, take the label of the preferred language as ID."""
    df["auto_id"] = pd.NA
    # every row has a custom ID: nothing to generate
    if not df["id (optional)"].isna().any():
        return df
    # row 0 is the list itself: its auto-ID is the list label of the preferred language
    if pd.isna(df.at[0, "id (optional)"]):
        df.loc[0, "auto_id"] = df.at[0, f"{preferred_language}_list"]
    # reverse-sorted, so that the deepest (highest-numbered) level column is tried first
    column_names = sorted(get_columns_of_preferred_lang(df.columns, preferred_language, r"\d+"), reverse=True)
    for i, row in df.iterrows():
        if pd.isna(row["id (optional)"]):
            # the first non-empty label (deepest level first) becomes the auto-ID of this node
            for col in column_names:
                if pd.notna(row[col]):
                    df.loc[i, "auto_id"] = row[col]  # type: ignore[index]
                    break
    # labels are not necessarily unique, so colliding auto-IDs must be disambiguated afterwards
    df = _resolve_duplicate_ids_for_auto_id_one_df(df, preferred_language)
    return df
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _resolve_duplicate_ids_for_auto_id_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
    """In case the auto_id is not unique; both auto_ids get a new ID by joining the node names of all the ancestors."""
    # `dropna()` keeps the original row labels, so the filter's index still addresses rows of `df`
    if (duplicate_filter := df["auto_id"].dropna().duplicated(keep=False)).any():
        for i in duplicate_filter.index[duplicate_filter]:
            df.loc[i, "auto_id"] = _construct_non_duplicate_id_string(df.loc[i], preferred_language)
    return df
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _construct_non_duplicate_id_string(row: pd.Series[Any], preferred_language: str) -> str:
    """In case the node name is not unique; an ID is created by joining the node names of all the ancestors."""
    ancestor_cols = get_columns_of_preferred_lang(row.index, preferred_language, r"\d+")
    ancestor_cols.insert(0, f"{preferred_language}_list")
    non_empty_labels = (row[col] for col in ancestor_cols if pd.notna(row[col]))
    return ":".join(non_empty_labels)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _make_serialised_lists(sheet_list: list[ExcelSheet]) -> list[dict[str, Any]]:
    """Build one list per sheet and serialise them, sorted by their ID."""
    all_lists: list[ListRoot] = [_make_one_list(sheet) for sheet in sheet_list]
    all_lists.sort(key=lambda list_root: list_root.id_)
    return [list_.to_dict() for list_ in all_lists]
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _make_one_list(sheet: ExcelSheet) -> ListRoot:
    """Convert one sheet into a ListRoot: row 0 is the list itself, all other rows are its nodes."""
    root_id = str(sheet.df.at[0, "id"])
    first_row = sheet.df.iloc[0]
    node_dict = _make_list_nodes_from_df(sheet.df, sheet.col_info)
    nodes_for_root = _add_nodes_to_parent(node_dict, root_id) if node_dict else []
    return ListRoot(
        id_=root_id,
        labels=_get_lang_dict(first_row, sheet.col_info.list_cols),
        nodes=nodes_for_root,
        comments=_get_lang_dict(first_row, sheet.col_info.comment_cols),
    )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _add_nodes_to_parent(node_dict: dict[str, ListNode], list_id: str) -> list[ListNode]:
|
|
212
|
+
root_list = []
|
|
213
|
+
for _, node in node_dict.items():
|
|
214
|
+
if node.parent_id == list_id:
|
|
215
|
+
root_list.append(node)
|
|
216
|
+
else:
|
|
217
|
+
node_dict[node.parent_id].sub_nodes.append(node)
|
|
218
|
+
return root_list
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _make_list_nodes_from_df(df: pd.DataFrame, col_info: Columns) -> dict[str, ListNode]:
    """Build a node for every row except the first (which is the list itself), keyed by node ID."""
    nodes = (_make_one_node(row, col_info) for _, row in df[1:].iterrows())
    return {node.id_: node for node in nodes}
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _make_one_node(row: pd.Series[Any], col_info: Columns) -> ListNode:
    """Create one node from one row; the first column group with content provides the labels."""
    labels: dict[str, str] = {}
    for col_group in col_info.node_cols:
        # take the labels of the first group that has any content, then stop looking
        if labels := _get_lang_dict(row, col_group.columns):
            break
    return ListNode(
        id_=str(row["id"]),
        labels=labels,
        comments=_get_lang_dict(row, col_info.comment_cols),
        parent_id=str(row["parent_id"]),
        sub_nodes=[],
    )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _get_lang_dict(row: pd.Series[Any], columns: list[str]) -> dict[str, str]:
    """Map language tag -> cell content for every filled column whose name yields a language tag."""
    result: dict[str, str] = {}
    for col in columns:
        if pd.isna(row[col]):
            continue
        if lang := get_lang_string_from_column_name(col):
            result[lang] = row[col]
    return result
|
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class ExcelSheet:
    """One worksheet of one Excel file, together with its parsed column metadata."""

    # name of the Excel file the sheet comes from
    excel_name: str
    # name of the worksheet inside that file
    sheet_name: str
    # parsed column structure of this sheet
    col_info: Columns
    # the sheet content
    df: pd.DataFrame
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Columns:
    """The column names of one sheet, grouped by their role."""

    # language whose columns define the hierarchy (e.g. "en" for "en_list", "en_1", ...)
    preferred_lang: str
    # columns holding the list label per language
    list_cols: list[str]
    # columns holding comments per language
    comment_cols: list[str]
    # node label columns, grouped per hierarchical level
    node_cols: list[ColumnNodes]

    def __post_init__(self) -> None:
        # sort descending by level number, so the highest-numbered level comes first
        self.node_cols = sorted(self.node_cols, key=lambda x: x.level_num, reverse=True)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ColumnNodes:
    """The columns belonging to one hierarchical level of the node labels."""

    # the hierarchical level, i.e. the numeric suffix of the column names (e.g. 1 for "en_1")
    level_num: int
    # the column names of that level
    columns: list[str]