dsp-tools 0.9.13__py3-none-any.whl → 18.3.0.post13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348) hide show
  1. dsp_tools/__init__.py +5 -0
  2. dsp_tools/cli/args.py +47 -0
  3. dsp_tools/cli/call_action.py +85 -0
  4. dsp_tools/cli/call_action_files_only.py +101 -0
  5. dsp_tools/cli/call_action_with_network.py +207 -0
  6. dsp_tools/cli/create_parsers.py +479 -0
  7. dsp_tools/cli/entry_point.py +322 -0
  8. dsp_tools/cli/utils.py +87 -0
  9. dsp_tools/clients/CLAUDE.md +420 -0
  10. dsp_tools/clients/authentication_client.py +14 -0
  11. dsp_tools/clients/authentication_client_live.py +66 -0
  12. dsp_tools/clients/connection.py +35 -0
  13. dsp_tools/clients/connection_live.py +233 -0
  14. dsp_tools/clients/fuseki_metrics.py +60 -0
  15. dsp_tools/clients/group_user_clients.py +35 -0
  16. dsp_tools/clients/group_user_clients_live.py +181 -0
  17. dsp_tools/clients/legal_info_client.py +23 -0
  18. dsp_tools/clients/legal_info_client_live.py +132 -0
  19. dsp_tools/clients/list_client.py +49 -0
  20. dsp_tools/clients/list_client_live.py +166 -0
  21. dsp_tools/clients/metadata_client.py +24 -0
  22. dsp_tools/clients/metadata_client_live.py +47 -0
  23. dsp_tools/clients/ontology_clients.py +49 -0
  24. dsp_tools/clients/ontology_create_client_live.py +166 -0
  25. dsp_tools/clients/ontology_get_client_live.py +80 -0
  26. dsp_tools/clients/permissions_client.py +68 -0
  27. dsp_tools/clients/project_client.py +16 -0
  28. dsp_tools/clients/project_client_live.py +66 -0
  29. dsp_tools/commands/create/communicate_problems.py +24 -0
  30. dsp_tools/commands/create/create.py +134 -0
  31. dsp_tools/commands/create/create_on_server/cardinalities.py +111 -0
  32. dsp_tools/commands/create/create_on_server/classes.py +99 -0
  33. dsp_tools/commands/create/create_on_server/complete_ontologies.py +116 -0
  34. dsp_tools/commands/create/create_on_server/default_permissions.py +134 -0
  35. dsp_tools/commands/create/create_on_server/group_users.py +165 -0
  36. dsp_tools/commands/create/create_on_server/lists.py +163 -0
  37. dsp_tools/commands/create/create_on_server/mappers.py +12 -0
  38. dsp_tools/commands/create/create_on_server/onto_utils.py +74 -0
  39. dsp_tools/commands/create/create_on_server/ontology.py +52 -0
  40. dsp_tools/commands/create/create_on_server/project.py +68 -0
  41. dsp_tools/commands/create/create_on_server/properties.py +119 -0
  42. dsp_tools/commands/create/exceptions.py +29 -0
  43. dsp_tools/commands/create/lists_only.py +66 -0
  44. dsp_tools/commands/create/models/create_problems.py +87 -0
  45. dsp_tools/commands/create/models/parsed_ontology.py +88 -0
  46. dsp_tools/commands/create/models/parsed_project.py +81 -0
  47. dsp_tools/commands/create/models/rdf_ontology.py +12 -0
  48. dsp_tools/commands/create/models/server_project_info.py +100 -0
  49. dsp_tools/commands/create/parsing/parse_lists.py +45 -0
  50. dsp_tools/commands/create/parsing/parse_ontology.py +243 -0
  51. dsp_tools/commands/create/parsing/parse_project.py +149 -0
  52. dsp_tools/commands/create/parsing/parsing_utils.py +40 -0
  53. dsp_tools/commands/create/project_validate.py +595 -0
  54. dsp_tools/commands/create/serialisation/ontology.py +119 -0
  55. dsp_tools/commands/create/serialisation/project.py +44 -0
  56. dsp_tools/commands/excel2json/CLAUDE.md +101 -0
  57. dsp_tools/commands/excel2json/json_header.py +321 -0
  58. dsp_tools/commands/excel2json/lists/__init__.py +0 -0
  59. dsp_tools/commands/excel2json/lists/compliance_checks.py +292 -0
  60. dsp_tools/commands/excel2json/lists/make_lists.py +247 -0
  61. dsp_tools/commands/excel2json/lists/models/__init__.py +0 -0
  62. dsp_tools/commands/excel2json/lists/models/deserialise.py +30 -0
  63. dsp_tools/commands/excel2json/lists/models/input_error.py +216 -0
  64. dsp_tools/commands/excel2json/lists/models/serialise.py +57 -0
  65. dsp_tools/commands/excel2json/lists/utils.py +81 -0
  66. dsp_tools/commands/excel2json/models/__init__.py +0 -0
  67. dsp_tools/commands/excel2json/models/input_error.py +416 -0
  68. dsp_tools/commands/excel2json/models/json_header.py +175 -0
  69. dsp_tools/commands/excel2json/models/list_node_name.py +16 -0
  70. dsp_tools/commands/excel2json/models/ontology.py +76 -0
  71. dsp_tools/commands/excel2json/old_lists.py +328 -0
  72. dsp_tools/commands/excel2json/project.py +280 -0
  73. dsp_tools/commands/excel2json/properties.py +370 -0
  74. dsp_tools/commands/excel2json/resources.py +336 -0
  75. dsp_tools/commands/excel2json/utils.py +352 -0
  76. dsp_tools/commands/excel2xml/__init__.py +7 -0
  77. dsp_tools/commands/excel2xml/excel2xml_cli.py +523 -0
  78. dsp_tools/commands/excel2xml/excel2xml_lib.py +1953 -0
  79. dsp_tools/commands/excel2xml/propertyelement.py +47 -0
  80. dsp_tools/commands/get/__init__.py +0 -0
  81. dsp_tools/commands/get/get.py +166 -0
  82. dsp_tools/commands/get/get_permissions.py +257 -0
  83. dsp_tools/commands/get/get_permissions_legacy.py +89 -0
  84. dsp_tools/commands/get/legacy_models/__init__.py +0 -0
  85. dsp_tools/commands/get/legacy_models/context.py +318 -0
  86. dsp_tools/commands/get/legacy_models/group.py +241 -0
  87. dsp_tools/commands/get/legacy_models/helpers.py +47 -0
  88. dsp_tools/commands/get/legacy_models/listnode.py +390 -0
  89. dsp_tools/commands/get/legacy_models/model.py +12 -0
  90. dsp_tools/commands/get/legacy_models/ontology.py +324 -0
  91. dsp_tools/commands/get/legacy_models/project.py +366 -0
  92. dsp_tools/commands/get/legacy_models/propertyclass.py +417 -0
  93. dsp_tools/commands/get/legacy_models/resourceclass.py +676 -0
  94. dsp_tools/commands/get/legacy_models/user.py +438 -0
  95. dsp_tools/commands/get/models/__init__.py +0 -0
  96. dsp_tools/commands/get/models/permissions_models.py +10 -0
  97. dsp_tools/commands/id2iri.py +258 -0
  98. dsp_tools/commands/ingest_xmlupload/__init__.py +0 -0
  99. dsp_tools/commands/ingest_xmlupload/bulk_ingest_client.py +178 -0
  100. dsp_tools/commands/ingest_xmlupload/create_resources/__init__.py +0 -0
  101. dsp_tools/commands/ingest_xmlupload/create_resources/apply_ingest_id.py +69 -0
  102. dsp_tools/commands/ingest_xmlupload/create_resources/upload_xml.py +166 -0
  103. dsp_tools/commands/ingest_xmlupload/create_resources/user_information.py +121 -0
  104. dsp_tools/commands/ingest_xmlupload/ingest_files/__init__.py +0 -0
  105. dsp_tools/commands/ingest_xmlupload/ingest_files/ingest_files.py +64 -0
  106. dsp_tools/commands/ingest_xmlupload/upload_files/__init__.py +0 -0
  107. dsp_tools/commands/ingest_xmlupload/upload_files/filechecker.py +20 -0
  108. dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +57 -0
  109. dsp_tools/commands/ingest_xmlupload/upload_files/upload_failures.py +66 -0
  110. dsp_tools/commands/ingest_xmlupload/upload_files/upload_files.py +67 -0
  111. dsp_tools/commands/resume_xmlupload/__init__.py +0 -0
  112. dsp_tools/commands/resume_xmlupload/resume_xmlupload.py +96 -0
  113. dsp_tools/commands/start_stack.py +428 -0
  114. dsp_tools/commands/update_legal/CLAUDE.md +344 -0
  115. dsp_tools/commands/update_legal/__init__.py +0 -0
  116. dsp_tools/commands/update_legal/core.py +182 -0
  117. dsp_tools/commands/update_legal/csv_operations.py +135 -0
  118. dsp_tools/commands/update_legal/models.py +87 -0
  119. dsp_tools/commands/update_legal/xml_operations.py +247 -0
  120. dsp_tools/commands/validate_data/CLAUDE.md +159 -0
  121. dsp_tools/commands/validate_data/__init__.py +0 -0
  122. dsp_tools/commands/validate_data/constants.py +59 -0
  123. dsp_tools/commands/validate_data/mappers.py +143 -0
  124. dsp_tools/commands/validate_data/models/__init__.py +0 -0
  125. dsp_tools/commands/validate_data/models/api_responses.py +45 -0
  126. dsp_tools/commands/validate_data/models/input_problems.py +119 -0
  127. dsp_tools/commands/validate_data/models/rdf_like_data.py +117 -0
  128. dsp_tools/commands/validate_data/models/validation.py +106 -0
  129. dsp_tools/commands/validate_data/prepare_data/__init__.py +0 -0
  130. dsp_tools/commands/validate_data/prepare_data/get_rdf_like_data.py +296 -0
  131. dsp_tools/commands/validate_data/prepare_data/make_data_graph.py +91 -0
  132. dsp_tools/commands/validate_data/prepare_data/prepare_data.py +184 -0
  133. dsp_tools/commands/validate_data/process_validation_report/__init__.py +0 -0
  134. dsp_tools/commands/validate_data/process_validation_report/get_user_validation_message.py +358 -0
  135. dsp_tools/commands/validate_data/process_validation_report/query_validation_result.py +507 -0
  136. dsp_tools/commands/validate_data/process_validation_report/reformat_validation_results.py +150 -0
  137. dsp_tools/commands/validate_data/shacl_cli_validator.py +70 -0
  138. dsp_tools/commands/validate_data/sparql/__init__.py +0 -0
  139. dsp_tools/commands/validate_data/sparql/cardinality_shacl.py +209 -0
  140. dsp_tools/commands/validate_data/sparql/construct_shacl.py +92 -0
  141. dsp_tools/commands/validate_data/sparql/legal_info_shacl.py +36 -0
  142. dsp_tools/commands/validate_data/sparql/value_shacl.py +357 -0
  143. dsp_tools/commands/validate_data/utils.py +59 -0
  144. dsp_tools/commands/validate_data/validate_data.py +283 -0
  145. dsp_tools/commands/validate_data/validation/__init__.py +0 -0
  146. dsp_tools/commands/validate_data/validation/check_duplicate_files.py +55 -0
  147. dsp_tools/commands/validate_data/validation/check_for_unknown_classes.py +67 -0
  148. dsp_tools/commands/validate_data/validation/get_validation_report.py +94 -0
  149. dsp_tools/commands/validate_data/validation/validate_ontology.py +107 -0
  150. dsp_tools/commands/xmlupload/CLAUDE.md +292 -0
  151. dsp_tools/commands/xmlupload/__init__.py +0 -0
  152. dsp_tools/commands/xmlupload/iri_resolver.py +21 -0
  153. dsp_tools/commands/xmlupload/make_rdf_graph/__init__.py +0 -0
  154. dsp_tools/commands/xmlupload/make_rdf_graph/constants.py +63 -0
  155. dsp_tools/commands/xmlupload/make_rdf_graph/jsonld_utils.py +44 -0
  156. dsp_tools/commands/xmlupload/make_rdf_graph/make_file_value.py +77 -0
  157. dsp_tools/commands/xmlupload/make_rdf_graph/make_resource_and_values.py +114 -0
  158. dsp_tools/commands/xmlupload/make_rdf_graph/make_values.py +262 -0
  159. dsp_tools/commands/xmlupload/models/__init__.py +0 -0
  160. dsp_tools/commands/xmlupload/models/bitstream_info.py +18 -0
  161. dsp_tools/commands/xmlupload/models/formatted_text_value.py +10 -0
  162. dsp_tools/commands/xmlupload/models/ingest.py +143 -0
  163. dsp_tools/commands/xmlupload/models/input_problems.py +58 -0
  164. dsp_tools/commands/xmlupload/models/lookup_models.py +21 -0
  165. dsp_tools/commands/xmlupload/models/permission.py +45 -0
  166. dsp_tools/commands/xmlupload/models/permissions_parsed.py +93 -0
  167. dsp_tools/commands/xmlupload/models/processed/__init__.py +0 -0
  168. dsp_tools/commands/xmlupload/models/processed/file_values.py +29 -0
  169. dsp_tools/commands/xmlupload/models/processed/res.py +27 -0
  170. dsp_tools/commands/xmlupload/models/processed/values.py +101 -0
  171. dsp_tools/commands/xmlupload/models/rdf_models.py +26 -0
  172. dsp_tools/commands/xmlupload/models/upload_clients.py +14 -0
  173. dsp_tools/commands/xmlupload/models/upload_state.py +20 -0
  174. dsp_tools/commands/xmlupload/prepare_xml_input/__init__.py +0 -0
  175. dsp_tools/commands/xmlupload/prepare_xml_input/ark2iri.py +55 -0
  176. dsp_tools/commands/xmlupload/prepare_xml_input/get_processed_resources.py +252 -0
  177. dsp_tools/commands/xmlupload/prepare_xml_input/iiif_uri_validator.py +50 -0
  178. dsp_tools/commands/xmlupload/prepare_xml_input/list_client.py +120 -0
  179. dsp_tools/commands/xmlupload/prepare_xml_input/prepare_xml_input.py +67 -0
  180. dsp_tools/commands/xmlupload/prepare_xml_input/read_validate_xml_file.py +58 -0
  181. dsp_tools/commands/xmlupload/prepare_xml_input/transform_input_values.py +118 -0
  182. dsp_tools/commands/xmlupload/resource_create_client.py +25 -0
  183. dsp_tools/commands/xmlupload/richtext_id2iri.py +37 -0
  184. dsp_tools/commands/xmlupload/stash/__init__.py +0 -0
  185. dsp_tools/commands/xmlupload/stash/analyse_circular_reference_graph.py +236 -0
  186. dsp_tools/commands/xmlupload/stash/create_info_for_graph.py +53 -0
  187. dsp_tools/commands/xmlupload/stash/graph_models.py +87 -0
  188. dsp_tools/commands/xmlupload/stash/stash_circular_references.py +68 -0
  189. dsp_tools/commands/xmlupload/stash/stash_models.py +109 -0
  190. dsp_tools/commands/xmlupload/stash/upload_stashed_resptr_props.py +106 -0
  191. dsp_tools/commands/xmlupload/stash/upload_stashed_xml_texts.py +196 -0
  192. dsp_tools/commands/xmlupload/upload_config.py +76 -0
  193. dsp_tools/commands/xmlupload/write_diagnostic_info.py +27 -0
  194. dsp_tools/commands/xmlupload/xmlupload.py +516 -0
  195. dsp_tools/config/__init__.py +0 -0
  196. dsp_tools/config/logger_config.py +69 -0
  197. dsp_tools/config/warnings_config.py +32 -0
  198. dsp_tools/error/__init__.py +0 -0
  199. dsp_tools/error/custom_warnings.py +39 -0
  200. dsp_tools/error/exceptions.py +204 -0
  201. dsp_tools/error/problems.py +10 -0
  202. dsp_tools/error/xmllib_errors.py +20 -0
  203. dsp_tools/error/xmllib_warnings.py +54 -0
  204. dsp_tools/error/xmllib_warnings_util.py +159 -0
  205. dsp_tools/error/xsd_validation_error_msg.py +19 -0
  206. dsp_tools/legacy_models/__init__.py +0 -0
  207. dsp_tools/legacy_models/datetimestamp.py +81 -0
  208. dsp_tools/legacy_models/langstring.py +253 -0
  209. dsp_tools/legacy_models/projectContext.py +49 -0
  210. dsp_tools/py.typed +0 -0
  211. dsp_tools/resources/schema/data.xsd +648 -0
  212. dsp_tools/resources/schema/lists-only.json +72 -0
  213. dsp_tools/resources/schema/project.json +1258 -0
  214. dsp_tools/resources/schema/properties-only.json +874 -0
  215. dsp_tools/resources/schema/resources-only.json +140 -0
  216. dsp_tools/resources/start-stack/docker-compose.override-host.j2 +11 -0
  217. dsp_tools/resources/start-stack/docker-compose.override.yml +11 -0
  218. dsp_tools/resources/start-stack/docker-compose.yml +88 -0
  219. dsp_tools/resources/start-stack/dsp-app-config.json +45 -0
  220. dsp_tools/resources/start-stack/dsp-app-config.override-host.j2 +26 -0
  221. dsp_tools/resources/validate_data/api-shapes-resource-cardinalities.ttl +191 -0
  222. dsp_tools/resources/validate_data/api-shapes.ttl +804 -0
  223. dsp_tools/resources/validate_data/shacl-cli-image.yml +4 -0
  224. dsp_tools/resources/validate_data/validate-ontology.ttl +99 -0
  225. dsp_tools/utils/__init__.py +0 -0
  226. dsp_tools/utils/ansi_colors.py +32 -0
  227. dsp_tools/utils/data_formats/__init__.py +0 -0
  228. dsp_tools/utils/data_formats/date_util.py +166 -0
  229. dsp_tools/utils/data_formats/iri_util.py +30 -0
  230. dsp_tools/utils/data_formats/shared.py +81 -0
  231. dsp_tools/utils/data_formats/uri_util.py +76 -0
  232. dsp_tools/utils/fuseki_bloating.py +63 -0
  233. dsp_tools/utils/json_parsing.py +22 -0
  234. dsp_tools/utils/rdf_constants.py +42 -0
  235. dsp_tools/utils/rdflib_utils.py +10 -0
  236. dsp_tools/utils/replace_id_with_iri.py +66 -0
  237. dsp_tools/utils/request_utils.py +238 -0
  238. dsp_tools/utils/xml_parsing/__init__.py +0 -0
  239. dsp_tools/utils/xml_parsing/get_lookups.py +32 -0
  240. dsp_tools/utils/xml_parsing/get_parsed_resources.py +325 -0
  241. dsp_tools/utils/xml_parsing/models/__init__.py +0 -0
  242. dsp_tools/utils/xml_parsing/models/parsed_resource.py +76 -0
  243. dsp_tools/utils/xml_parsing/parse_clean_validate_xml.py +137 -0
  244. dsp_tools/xmllib/CLAUDE.md +302 -0
  245. dsp_tools/xmllib/__init__.py +49 -0
  246. dsp_tools/xmllib/general_functions.py +877 -0
  247. dsp_tools/xmllib/internal/__init__.py +0 -0
  248. dsp_tools/xmllib/internal/checkers.py +162 -0
  249. dsp_tools/xmllib/internal/circumvent_circular_imports.py +36 -0
  250. dsp_tools/xmllib/internal/constants.py +46 -0
  251. dsp_tools/xmllib/internal/input_converters.py +155 -0
  252. dsp_tools/xmllib/internal/serialise_file_value.py +57 -0
  253. dsp_tools/xmllib/internal/serialise_resource.py +177 -0
  254. dsp_tools/xmllib/internal/serialise_values.py +152 -0
  255. dsp_tools/xmllib/internal/type_aliases.py +11 -0
  256. dsp_tools/xmllib/models/__init__.py +0 -0
  257. dsp_tools/xmllib/models/config_options.py +28 -0
  258. dsp_tools/xmllib/models/date_formats.py +48 -0
  259. dsp_tools/xmllib/models/dsp_base_resources.py +1542 -0
  260. dsp_tools/xmllib/models/internal/__init__.py +0 -0
  261. dsp_tools/xmllib/models/internal/file_values.py +172 -0
  262. dsp_tools/xmllib/models/internal/geometry.py +162 -0
  263. dsp_tools/xmllib/models/internal/migration_metadata.py +55 -0
  264. dsp_tools/xmllib/models/internal/serialise_permissions.py +66 -0
  265. dsp_tools/xmllib/models/internal/values.py +342 -0
  266. dsp_tools/xmllib/models/licenses/__init__.py +0 -0
  267. dsp_tools/xmllib/models/licenses/other.py +59 -0
  268. dsp_tools/xmllib/models/licenses/recommended.py +107 -0
  269. dsp_tools/xmllib/models/permissions.py +41 -0
  270. dsp_tools/xmllib/models/res.py +1782 -0
  271. dsp_tools/xmllib/models/root.py +348 -0
  272. dsp_tools/xmllib/value_checkers.py +434 -0
  273. dsp_tools/xmllib/value_converters.py +777 -0
  274. dsp_tools-18.3.0.post13.dist-info/METADATA +90 -0
  275. dsp_tools-18.3.0.post13.dist-info/RECORD +286 -0
  276. dsp_tools-18.3.0.post13.dist-info/WHEEL +4 -0
  277. dsp_tools-18.3.0.post13.dist-info/entry_points.txt +3 -0
  278. dsp_tools-0.9.13.dist-info/LICENSE +0 -674
  279. dsp_tools-0.9.13.dist-info/METADATA +0 -144
  280. dsp_tools-0.9.13.dist-info/RECORD +0 -71
  281. dsp_tools-0.9.13.dist-info/WHEEL +0 -5
  282. dsp_tools-0.9.13.dist-info/entry_points.txt +0 -3
  283. dsp_tools-0.9.13.dist-info/top_level.txt +0 -1
  284. dsplib/models/connection.py +0 -272
  285. dsplib/models/group.py +0 -296
  286. dsplib/models/helpers.py +0 -505
  287. dsplib/models/langstring.py +0 -277
  288. dsplib/models/listnode.py +0 -578
  289. dsplib/models/model.py +0 -20
  290. dsplib/models/ontology.py +0 -448
  291. dsplib/models/permission.py +0 -112
  292. dsplib/models/project.py +0 -547
  293. dsplib/models/propertyclass.py +0 -505
  294. dsplib/models/resource.py +0 -366
  295. dsplib/models/resourceclass.py +0 -810
  296. dsplib/models/sipi.py +0 -30
  297. dsplib/models/user.py +0 -731
  298. dsplib/models/value.py +0 -1000
  299. dsplib/utils/knora-data-schema.xsd +0 -454
  300. dsplib/utils/knora-schema-lists.json +0 -83
  301. dsplib/utils/knora-schema.json +0 -434
  302. dsplib/utils/onto_commons.py +0 -24
  303. dsplib/utils/onto_create_lists.py +0 -73
  304. dsplib/utils/onto_create_ontology.py +0 -442
  305. dsplib/utils/onto_get.py +0 -58
  306. dsplib/utils/onto_validate.py +0 -33
  307. dsplib/utils/xml_upload.py +0 -539
  308. dsplib/widgets/doublepassword.py +0 -80
  309. knora/MLS-import-libraries.py +0 -84
  310. knora/dsp_tools.py +0 -96
  311. knora/dsplib/models/connection.py +0 -272
  312. knora/dsplib/models/group.py +0 -296
  313. knora/dsplib/models/helpers.py +0 -506
  314. knora/dsplib/models/langstring.py +0 -277
  315. knora/dsplib/models/listnode.py +0 -578
  316. knora/dsplib/models/model.py +0 -20
  317. knora/dsplib/models/ontology.py +0 -448
  318. knora/dsplib/models/permission.py +0 -112
  319. knora/dsplib/models/project.py +0 -583
  320. knora/dsplib/models/propertyclass.py +0 -505
  321. knora/dsplib/models/resource.py +0 -416
  322. knora/dsplib/models/resourceclass.py +0 -811
  323. knora/dsplib/models/sipi.py +0 -35
  324. knora/dsplib/models/user.py +0 -731
  325. knora/dsplib/models/value.py +0 -1000
  326. knora/dsplib/utils/knora-data-schema.xsd +0 -464
  327. knora/dsplib/utils/knora-schema-lists.json +0 -83
  328. knora/dsplib/utils/knora-schema.json +0 -444
  329. knora/dsplib/utils/onto_commons.py +0 -24
  330. knora/dsplib/utils/onto_create_lists.py +0 -73
  331. knora/dsplib/utils/onto_create_ontology.py +0 -451
  332. knora/dsplib/utils/onto_get.py +0 -58
  333. knora/dsplib/utils/onto_validate.py +0 -33
  334. knora/dsplib/utils/xml_upload.py +0 -540
  335. knora/dsplib/widgets/doublepassword.py +0 -80
  336. knora/knora.py +0 -2108
  337. knora/test.py +0 -99
  338. knora/testit.py +0 -76
  339. knora/xml2knora.py +0 -633
  340. {dsplib → dsp_tools/cli}/__init__.py +0 -0
  341. {dsplib/models → dsp_tools/clients}/__init__.py +0 -0
  342. {dsplib/utils → dsp_tools/commands}/__init__.py +0 -0
  343. {dsplib/widgets → dsp_tools/commands/create}/__init__.py +0 -0
  344. {knora → dsp_tools/commands/create/create_on_server}/__init__.py +0 -0
  345. {knora/dsplib → dsp_tools/commands/create/models}/__init__.py +0 -0
  346. {knora/dsplib/models → dsp_tools/commands/create/parsing}/__init__.py +0 -0
  347. {knora/dsplib/utils → dsp_tools/commands/create/serialisation}/__init__.py +0 -0
  348. {knora/dsplib/widgets → dsp_tools/commands/excel2json}/__init__.py +0 -0
@@ -0,0 +1,344 @@
1
+ # Update Legal Metadata Command
2
+
3
+ ## Purpose
4
+
5
+ The `update-legal` command converts legal metadata in XML files from the old format (text properties)
6
+ to the new format (bitstream attributes). This migration is necessary because:
7
+
8
+ - **Old format**: Legal metadata (authorship, copyright, license) stored as `<text-prop>` elements within resources
9
+ - **New format**: Legal metadata stored as attributes directly on `<bitstream>` or `<iiif-uri>` elements
10
+
11
+ This command automates the migration while handling validation, error correction, and authorship deduplication.
12
+
13
+ ## Command Usage
14
+
15
+ ```bash
16
+ dsp-tools update-legal \
17
+ --authorship_prop=":hasAuthor" \
18
+ --copyright_prop=":hasCopyright" \
19
+ --license_prop=":hasLicense" \
20
+ --authorship_default="Project Member" \
21
+ --copyright_default="University" \
22
+ --license_default="CC BY" \
23
+ --fixed_errors="data_legal_errors.csv" \
24
+ data.xml
25
+ ```
26
+
27
+ See [docs/special-workflows/update-legal.md](../../../../docs/special-workflows/update-legal.md) for user documentation.
28
+
29
+ ## Architecture Overview
30
+
31
+ ### Data Flow Pipeline
32
+
33
+ The command follows a multi-stage pipeline:
34
+
35
+ 1. **Parse & Validate**: Parse XML file and validate property names exist
36
+ 2. **Extract Metadata**: For each resource with media, extract legal metadata using priority system
37
+ 3. **Validation**: Check for missing/invalid values
38
+ 4. **Error Handling**: If errors exist, write CSV for manual correction
39
+ 5. **Update XML**: If no errors, apply metadata as attributes and write updated XML
40
+ 6. **Iteration**: User fixes CSV and reruns command with `--fixed_errors` until all errors are resolved
41
+
42
+ ### Priority System
43
+
44
+ Metadata values are resolved using this priority order:
45
+
46
+ 1. **CSV corrections** (from `--fixed_errors` file)
47
+ 2. **XML properties** (extracted from text-prop elements)
48
+ 3. **Default values** (from `--*_default` flags)
49
+ 4. **None** (triggers validation error)
50
+
51
+ ## Module Responsibilities
52
+
53
+ ### [core.py](core.py)
54
+
55
+ Main orchestration and validation logic:
56
+
57
+ - `update_legal_metadata()`: Entry point that coordinates entire workflow
58
+ - `_validate_flags()`: Ensures property names exist in XML
59
+ - `_update_xml_tree()`: Iterates through resources, collects metadata once per resource, decides whether to apply changes
60
+ - `_has_problems()`: Checks if metadata contains FIXME markers or missing values
61
+ - `_update_counter()`: Tracks statistics for final report
62
+
63
+ Key patterns:
64
+
65
+ - Uses functional approach with pure helper functions
66
+ - Clear separation: collection (read-only) vs application (mutations)
67
+ - Single-pass metadata collection eliminates duplicate work
68
+ - Authorship deduplication via `auth_text_to_id` dictionary (maps authorship text to unique ID)
69
+
70
+ ### [models.py](models.py)
71
+
72
+ Data structures and configuration:
73
+
74
+ - `LegalProperties`: Configuration for XML property names (e.g., `:hasAuthor`)
75
+ - `LegalMetadata`: Represents legal metadata for a single resource (license, copyright, authorships)
76
+ - `LegalMetadataDefaults`: Default values with automatic license parsing
77
+ - `Problem`: Represents validation error for CSV export
78
+ - `UpdateCounter`: Statistics tracker for final report
79
+
80
+ Important notes:
81
+
82
+ - `LegalMetadataDefaults.__init__()` automatically validates and parses license strings
83
+ using `xmllib.find_license_in_string()`
84
+ - Dataclasses use `frozen=True` for immutability where appropriate
85
+
86
+ ### [csv_operations.py](csv_operations.py)
87
+
88
+ CSV I/O for error handling workflow:
89
+
90
+ - `ProblemAggregator`: Converts problems to DataFrame with dynamic authorship columns
91
+ - `read_corrections_csv()`: Parses CSV corrections into `dict[resource_id, LegalMetadata]`
92
+ - `write_problems_to_csv()`: Writes validation errors to CSV with helpful FIXME markers
93
+ - `is_fixme_value()`: Checks if value starts with "FIXME:" prefix
94
+
95
+ **CSV format:**
96
+
97
+ - Fixed columns: `file`, `resource_id`, `license`, `copyright`
98
+ - Dynamic columns: `authorship_1`, `authorship_2`, ... (as many as needed)
99
+ - Sorted by resource_id for easy navigation
100
+
101
+ **Error prevention:**
102
+
103
+ - Refuses to overwrite existing CSV unless `--fixed_errors` flag provided
104
+ - Helpful error message suggests correct flag usage
105
+
106
+ ### [xml_operations.py](xml_operations.py)
107
+
108
+ XML manipulation and metadata application:
109
+
110
+ - `collect_metadata()`: Pure function that collects metadata from CSV, XML, or defaults (read-only)
111
+ - `apply_metadata_to_resource()`: Applies metadata as attributes and removes old text properties (mutations)
112
+ - `_resolve_metadata_values()`: Implements priority system (CSV > XML > defaults)
113
+ - `_extract_license_from_xml()`: Extracts license and validates with `xmllib.find_license_in_string()`
114
+ - `_extract_copyright_from_xml()`: Extracts copyright, detects duplicates
115
+ - `_extract_authorships_from_xml()`: Collects all authorship values
116
+ - `_apply_metadata_to_element()`: Applies metadata as attributes on media element
117
+ - `_remove_text_properties()`: Removes old text-prop elements
118
+ - `add_authorship_definitions_to_xml()`: Creates `<authorship>` definitions at root level
119
+ - `write_final_xml()`: Writes updated XML with statistics
120
+
121
+ **Authorship deduplication:**
122
+
123
+ - Multiple resources can share same authorship (e.g., "Jane Doe, Alice Jenkins")
124
+ - `auth_text_to_id` dictionary tracks unique authorships and assigns sequential IDs
125
+ - Authorship definitions added to root as `<authorship id="authorship_0">` elements
126
+ - Media elements reference via `authorship-id="authorship_0"` attribute
127
+
128
+ **Multiple value detection:**
129
+
130
+ - If multiple copyright values found: returns `"FIXME: Multiple copyrights found. Choose one: ..."`
131
+ - If multiple license values found: returns `"FIXME: Multiple licenses found. Choose one: ..."`
132
+ - This triggers CSV export for manual resolution
133
+
134
+ ## Architectural Improvements
135
+
136
+ ### Separation of Collection and Application
137
+
138
+ The codebase follows a clear pattern separating read operations from write operations:
139
+
140
+ **Collection Phase (`collect_metadata()`):**
141
+
142
+ - Pure function with no side effects
143
+ - Reads from CSV, XML properties, and defaults
144
+ - Returns `LegalMetadata` object
145
+ - Can be called safely without modifying the XML tree
146
+ - Executes exactly once per resource
147
+
148
+ **Application Phase (`apply_metadata_to_resource()`):**
149
+
150
+ - Mutates the XML tree in-place
151
+ - Applies metadata as attributes on media elements
152
+ - Removes old text properties
153
+ - Manages authorship deduplication dictionary
154
+ - Only called for valid resources (no problems)
155
+
156
+ **Benefits:**
157
+
158
+ - **Performance**: Eliminates duplicate XPath queries (~50% reduction for valid resources)
159
+ - **Clarity**: Clear contract - collection is read-only, application mutates
160
+ - **Safety**: Impossible to accidentally mutate during problem detection
161
+ - **Testability**: Each phase can be tested independently
162
+
163
+ ## Key Algorithms
164
+
165
+ ### Authorship Deduplication
166
+
167
+ Problem: Multiple resources may share the same authorship (e.g., "Jane Doe, Alice Jenkins").
168
+
169
+ Solution:
170
+
171
+ 1. Maintain `auth_text_to_id: dict[str, int]` throughout tree traversal
172
+ 2. When applying authorship to media element:
173
+ - Join all authorship values with ", " separator
174
+ - Check if this text already has an ID
175
+ - If not, assign next sequential ID
176
+ - Add `authorship-id="authorship_{id}"` attribute to media element
177
+ 3. After tree traversal, create `<authorship>` definitions at root level
178
+ 4. Each definition contains `<author>` child elements
179
+
180
+ Result: Shared authorships stored once at root, referenced by multiple resources.
181
+
182
+ ### FIXME Value Detection
183
+
184
+ Problem: Need to distinguish between missing values and values that need manual correction.
185
+
186
+ Solution:
187
+
188
+ - Use "FIXME:" prefix for values that need correction
189
+ - `is_fixme_value()` checks for this prefix
190
+ - During extraction:
191
+ - Multiple values: `"FIXME: Multiple X found. Choose one: A, B"`
192
+ - Invalid license: `"FIXME: Invalid license: courtesy of museum"`
193
+ - During validation: FIXME values are treated the same as missing values (both mark the resource as having problems)
194
+ - During CSV reading: FIXME values converted back to None
195
+
196
+ Result: Clear distinction between "missing" and "needs correction" in CSV workflow.
197
+
198
+ ### License Parsing
199
+
200
+ Problem: License strings come in many formats (`CC BY`, `CC-BY-4.0`, `http://rdfh.ch/licenses/cc-by-4.0`).
201
+
202
+ Solution:
203
+
204
+ - Use `xmllib.find_license_in_string()` to parse license text into standardized License enum
205
+ - If parsing fails, return `"FIXME: Invalid license: {text}"`
206
+ - In defaults, parse license string during `__init__()` to fail fast if invalid
207
+
208
+ Result: All licenses normalized to standard IRIs.
209
+
210
+ ## Error Handling Strategy
211
+
212
+ ### Iterative CSV Correction Workflow
213
+
214
+ 1. **First run**: User provides property names and defaults
215
+ - Command extracts metadata using priority system
216
+ - Validates all values
217
+ - If errors found: writes CSV with FIXME markers
218
+ - No XML output created
219
+
220
+ 2. **Manual correction**: User fixes CSV
221
+ - Replaces FIXME markers with correct values
222
+ - Can add missing values
223
+ - Can choose between multiple values
224
+ - **Important**: Can modify ANY column (not just FIXME ones) - see "CSV Override Behavior" below
225
+
226
+ 3. **Second run**: User provides `--fixed_errors` flag
227
+ - Command loads corrections from CSV
228
+ - CSV corrections take highest priority
229
+ - Validates again
230
+ - If still errors: writes new CSV
231
+ - If no errors: writes updated XML
232
+
233
+ 4. **Repeat** until all errors resolved
234
+
235
+ ### Validation Rules
236
+
237
+ A resource has problems if:
238
+
239
+ - License is None or a FIXME value
240
+ - Copyright is None or a FIXME value
241
+ - Authorships list is empty or contains a FIXME value
242
+
243
+ Important: A resource must have ALL THREE components valid to avoid CSV export.
244
+
245
+ ### CSV Override Behavior
246
+
247
+ **Critical implementation detail:**
248
+
249
+ When `--fixed_errors` is used, ALL non-None CSV values override XML properties and defaults for resources in that CSV.
250
+ This applies to every column, not just FIXME markers.
251
+
252
+ **Priority resolution in `_resolve_metadata_values()`:**
253
+
254
+ 1. If CSV has non-None value: use it (skip XML extraction and defaults)
255
+ 2. Else if XML has value: use it (skip defaults)
256
+ 3. Else if defaults provided: use them
257
+ 4. Else: None (triggers validation error)
258
+
259
+ **Note:** FIXME-prefixed values are converted to None during CSV reading, allowing fallback to XML/defaults.
260
+
261
+ ### Error Messages
262
+
263
+ User-facing error messages:
264
+
265
+ - Missing property: Caught early in `_validate_flags()` with clear message
266
+ - Existing CSV: Suggests using `--fixed_errors` flag
267
+ - Invalid license default: Raised during `LegalMetadataDefaults.__init__()`
268
+ - FIXME markers in CSV: Provide context about what needs fixing
269
+
270
+ ## Testing Considerations
271
+
272
+ ### Unit Testing
273
+
274
+ Test each function in isolation:
275
+
276
+ - **Extraction functions**: Test with various XML structures (missing, single, multiple values)
277
+ - **Priority resolution**: Test all combinations of CSV/XML/defaults
278
+ - **FIXME detection**: Test all FIXME marker formats
279
+ - **CSV operations**: Test round-trip (write problems → read corrections)
280
+ - **Authorship deduplication**: Test ID assignment and reuse
281
+
282
+ ### Integration Testing
283
+
284
+ Test file I/O and cross-module interactions:
285
+
286
+ - **Full workflow**: Input XML → CSV → corrected CSV → output XML
287
+ - **Property validation**: Missing properties raise correct error
288
+ - **Default values**: Applied when XML values missing
289
+ - **CSV overwrite protection**: Existing CSV prevents accidental overwrite
290
+
291
+ ### E2E Testing
292
+
293
+ Test realistic scenarios:
294
+
295
+ - **Simple case**: All metadata present and valid
296
+ - **Missing values**: Some resources missing authorship/copyright/license
297
+ - **Invalid licenses**: Test "courtesy" and other invalid formats
298
+ - **Multiple values**: Resources with multiple copyright/license values
299
+ - **Shared authorships**: Multiple resources with same authorship
300
+ - **Iterative correction**: Multiple runs with CSV corrections
301
+
302
+ ### Edge Cases
303
+
304
+ - Empty authorship text values (should be filtered out)
305
+ - Whitespace-only values (should be treated as empty)
306
+ - Resources without media elements (should be skipped)
307
+ - Both bitstream and iiif-uri present (first one used)
308
+ - Unicode in authorship names
309
+ - Very long authorship lists
310
+
311
+ ## Common Pitfalls
312
+
313
+ 1. **Forgetting `--fixed_errors` flag**: Command will refuse to overwrite existing CSV
314
+ 2. **Not providing any property flags**: Caught early with validation error
315
+ 3. **Property names with wrong namespace**: Caught early when no matches found
316
+ 4. **Leaving FIXME markers in CSV**: Treated as missing values, triggers new CSV
317
+ 5. **Invalid license default**: Fails during defaults initialization, not during execution
318
+
319
+ ## Performance Considerations
320
+
321
+ - **Single-pass tree traversal**: All resources processed in one iteration
322
+ - **In-memory CSV**: Entire corrections CSV loaded into memory as dictionary
323
+ - **lxml XPath**: Efficient XPath queries for property extraction
324
+ - **Authorship deduplication**: O(1) lookup via dictionary
325
+ - **CSV sorting**: Results sorted by resource_id for easier navigation
326
+
327
+ For typical XML files (thousands of resources), performance should be near-instantaneous.
328
+
329
+ ## Dependencies
330
+
331
+ - **lxml**: XML parsing and manipulation
332
+ - **pandas**: CSV I/O with proper column handling
333
+ - **xmllib**: License parsing utilities (`find_license_in_string()`)
334
+ - **dsp_tools.utils.xml_parsing**: XML parsing/validation utilities
335
+
336
+ ## Future Improvements
337
+
338
+ Possible enhancements:
339
+
340
+ - Batch processing: Process multiple XML files at once
341
+ - Auto-detection: Try to guess property names from XML structure
342
+ - Validation preview: Show what would be changed without modifying XML
343
+ - Undo functionality: Revert XML back to text properties
344
+ - License suggestions: Use fuzzy matching for invalid licenses
File without changes
@@ -0,0 +1,182 @@
1
+ from pathlib import Path
2
+
3
+ from lxml import etree
4
+
5
+ from dsp_tools.commands.update_legal.csv_operations import is_fixme_value
6
+ from dsp_tools.commands.update_legal.csv_operations import read_corrections_csv
7
+ from dsp_tools.commands.update_legal.csv_operations import write_problems_to_csv
8
+ from dsp_tools.commands.update_legal.models import Authorships
9
+ from dsp_tools.commands.update_legal.models import LegalMetadata
10
+ from dsp_tools.commands.update_legal.models import LegalMetadataDefaults
11
+ from dsp_tools.commands.update_legal.models import LegalProperties
12
+ from dsp_tools.commands.update_legal.models import Problem
13
+ from dsp_tools.commands.update_legal.models import UpdateCounter
14
+ from dsp_tools.commands.update_legal.xml_operations import add_authorship_definitions_to_xml
15
+ from dsp_tools.commands.update_legal.xml_operations import apply_metadata_to_resource
16
+ from dsp_tools.commands.update_legal.xml_operations import collect_metadata
17
+ from dsp_tools.commands.update_legal.xml_operations import write_updated_xml
18
+ from dsp_tools.error.exceptions import InputError
19
+ from dsp_tools.utils.xml_parsing.parse_clean_validate_xml import parse_xml_file
20
+ from dsp_tools.utils.xml_parsing.parse_clean_validate_xml import transform_into_localnames
21
+
22
+
23
def update_legal_metadata(
    input_file: Path,
    properties: LegalProperties,
    defaults: LegalMetadataDefaults,
    fixed_errors_file: Path | None = None,
    treat_invalid_licenses_as_unknown: bool = False,
) -> bool:
    """
    Update legal metadata in an XML file, converting text properties to bitstream attributes.

    The XML is always written: as a full update if no problems were found,
    as a partial update (together with a CSV listing the problems) otherwise.

    Args:
        input_file: Path to the input XML file
        properties: Configuration for property names to extract from XML
        defaults: Default values to use when metadata is missing
        fixed_errors_file: Path to CSV file with corrected values
        treat_invalid_licenses_as_unknown: If True, invalid licenses are replaced with 'unknown'

    Returns:
        True if all legal metadata could be updated, False if CSV error file was created
    """
    # The corrections are loaded before the XML is touched.
    csv_corrections = read_corrections_csv(fixed_errors_file) if fixed_errors_file else None

    root = transform_into_localnames(parse_xml_file(input_file))
    _validate_flags(root, properties)

    root_updated, counter, problems = _update_xml_tree(
        root=root,
        properties=properties,
        defaults=defaults,
        csv_corrections=csv_corrections,
        treat_invalid_licenses_as_unknown=treat_invalid_licenses_as_unknown,
    )

    found_problems = bool(problems)
    if found_problems:
        # The CSV is written first, then the partially updated XML.
        write_problems_to_csv(input_file, problems)
    write_updated_xml(input_file, root_updated, counter, partial=found_problems)
    return not found_problems
68
+
69
+
70
+ def _validate_flags(root: etree._Element, properties: LegalProperties) -> None:
71
+ if not properties.has_any_property():
72
+ raise InputError("At least one property (authorship_prop, copyright_prop, license_prop) must be provided")
73
+ text_prop_names = {x for x in root.xpath("//text-prop/@name")}
74
+ inexisting_props = [
75
+ x
76
+ for x in [properties.authorship_prop, properties.copyright_prop, properties.license_prop]
77
+ if x and x not in text_prop_names
78
+ ]
79
+ if inexisting_props:
80
+ raise InputError(f"The following properties do not exist in the XML file: {', '.join(inexisting_props)}")
81
+
82
+
83
def _update_xml_tree(
    root: etree._Element,
    properties: LegalProperties,
    defaults: LegalMetadataDefaults,
    csv_corrections: dict[str, LegalMetadata] | None = None,
    treat_invalid_licenses_as_unknown: bool = False,
) -> tuple[etree._Element, UpdateCounter, list[Problem]]:
    """
    Walk over all <resource> children of the root and process their legal metadata.

    Resources that have neither a <bitstream> nor an <iiif-uri> child are skipped.
    For every other resource, the metadata is collected with `collect_metadata()`.
    If the collected metadata has problems, a Problem is recorded for the CSV output
    and `apply_metadata_to_resource()` is not called for that resource.
    Otherwise, the metadata is applied to the resource and the counter is incremented.

    Args:
        root: The XML root element
        properties: Configuration for property names to extract from XML
        defaults: Default values to use when metadata is missing
        csv_corrections: Dictionary of corrections from CSV (or None)
        treat_invalid_licenses_as_unknown: If True, invalid licenses are replaced with 'unknown'

    Returns:
        Tuple of (updated root element, counter of updated resources, list of problems)
    """
    counter = UpdateCounter()
    problems: list[Problem] = []
    auth_text_to_id: dict[Authorships, int] = {}

    for res in root.iterchildren(tag="resource"):
        media_tag_candidates = res.xpath("bitstream|iiif-uri")
        if not media_tag_candidates:
            continue

        res_id = res.attrib["id"]
        # If there are several media elements, only the first one is considered.
        media_elem = media_tag_candidates[0]

        metadata = collect_metadata(
            res=res,
            properties=properties,
            defaults=defaults,
            counter=counter,
            csv_metadata=csv_corrections.get(res_id) if csv_corrections else None,
            treat_invalid_licenses_as_unknown=treat_invalid_licenses_as_unknown,
        )

        if _has_problems(metadata):
            # Empty authorship values are dropped; missing fields get a FIXME placeholder.
            non_empty_authorships = sorted(filter(None, metadata.authorships.elems))
            problems.append(
                Problem(
                    file_or_iiif_uri=str(media_elem.text).strip(),
                    res_id=res_id,
                    license=metadata.license or "FIXME: License missing",
                    copyright=metadata.copyright or "FIXME: Copyright missing",
                    authorships=non_empty_authorships or ["FIXME: Authorship missing"],
                )
            )
            continue

        if metadata.any():
            apply_metadata_to_resource(
                res=res,
                media_elem=media_elem,
                metadata=metadata,
                properties=properties,
                auth_text_to_id=auth_text_to_id,
            )
            _update_counter(counter, metadata)

    if auth_text_to_id:
        add_authorship_definitions_to_xml(root, auth_text_to_id)

    return root, counter, problems
150
+
151
+
152
+ def _has_problems(metadata: LegalMetadata) -> bool:
153
+ """
154
+ Check if metadata has any missing or invalid fields that should be reported in CSV.
155
+
156
+ Args:
157
+ metadata: The legal metadata to check
158
+
159
+ Returns:
160
+ True if there are problems, False otherwise
161
+ """
162
+ has_license_problem = metadata.license is None or is_fixme_value(metadata.license)
163
+ has_copyright_problem = metadata.copyright is None or is_fixme_value(metadata.copyright)
164
+
165
+ if not any(x for x in metadata.authorships.elems if x):
166
+ has_authorship_problem = True
167
+ elif any(is_fixme_value(x) for x in metadata.authorships.elems):
168
+ has_authorship_problem = True
169
+ else:
170
+ has_authorship_problem = False
171
+
172
+ return has_license_problem or has_copyright_problem or has_authorship_problem
173
+
174
+
175
+ def _update_counter(counter: UpdateCounter, metadata: LegalMetadata) -> None:
176
+ counter.resources_updated += 1
177
+ if metadata.license:
178
+ counter.licenses_set += 1
179
+ if metadata.copyright:
180
+ counter.copyrights_set += 1
181
+ if metadata.authorships:
182
+ counter.authorships_set += 1
@@ -0,0 +1,135 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ import regex
6
+
7
+ from dsp_tools.commands.update_legal.models import Authorships
8
+ from dsp_tools.commands.update_legal.models import LegalMetadata
9
+ from dsp_tools.commands.update_legal.models import Problem
10
+ from dsp_tools.error.exceptions import InputError
11
+
12
+
13
@dataclass(frozen=True)
class ProblemAggregator:
    """Aggregates multiple problems and provides DataFrame export for CSV generation."""

    problems: list[Problem]

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert problems to DataFrame for CSV export.

        The columns are file, resource_id, license, copyright, authorship_1 ... authorship_N,
        where N is the largest number of authorships of any problem.
        Problems with fewer authorships are padded with empty strings.
        The rows are sorted by resource_id.
        An empty problem list yields an empty DataFrame that still has the four base columns.

        Returns:
            DataFrame with one row per problem
        """
        max_authorships = max((len(p.authorships) for p in self.problems), default=0)
        base_cols = ["file", "resource_id", "license", "copyright"]
        auth_cols = [f"authorship_{i}" for i in range(1, max_authorships + 1)]

        rows = []
        for problem in self.problems:
            row = {
                "file": problem.file_or_iiif_uri,
                "resource_id": problem.res_id,
                "license": problem.license,
                "copyright": problem.copyright,
            }
            padding = [""] * (max_authorships - len(problem.authorships))
            row.update(zip(auth_cols, [*problem.authorships, *padding]))
            rows.append(row)

        # Passing the columns explicitly fixes their order,
        # and keeps the frame sortable even if there are no problems at all.
        df = pd.DataFrame(rows, columns=base_cols + auth_cols)
        return df.sort_values(by=["resource_id"])

    def save_to_csv(self, input_file: Path) -> Path:
        """
        Save problems to CSV file.

        The file is created next to the input file as `<stem>_legal_errors.csv`.
        If that file already exists, `<stem>_legal_errors_1.csv`, `<stem>_legal_errors_2.csv`, ...
        are tried until an unused name is found, so an existing file is never overwritten.

        Args:
            input_file: The input XML file path, used to determine the output path

        Returns:
            Path to the created CSV file
        """
        base_stem = f"{input_file.stem}_legal_errors"
        output_path = input_file.parent / f"{base_stem}.csv"
        i = 1
        while output_path.exists():
            output_path = input_file.parent / f"{base_stem}_{i}.csv"
            i += 1
        # mode="x" makes the write fail instead of overwriting, should the file appear in the meantime
        self.to_dataframe().to_csv(output_path, index=False, mode="x")
        return output_path
69
+
70
+
71
def read_corrections_csv(csv_path: Path) -> dict[str, LegalMetadata]:
    """
    Read corrected legal metadata from a CSV file, and return a mapping from resource ID to LegalMetadata.

    License and copyright cells that are empty or still carry a FIXME marker are returned as None.
    If a resource ID occurs in several rows, the last row wins.

    Args:
        csv_path: Path to the CSV file with the corrections

    Returns:
        Mapping from resource ID to the legal metadata found in the CSV

    Raises:
        InputError: if one of the columns file, resource_id, license, copyright is missing
    """
    # Read every cell as text. With pandas' type inference, an ID like "007" would become 7,
    # and a numeric column containing empty cells would become float ("2024" -> "2024.0").
    df = pd.read_csv(csv_path, dtype=str)

    # Validate required columns
    required_cols = {"file", "resource_id", "license", "copyright"}
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        msg = f"CSV file is missing required columns: {missing}"
        raise InputError(msg)

    corrections = {}
    for _, row in df.iterrows():
        res_id = str(row["resource_id"])

        license_val = str(row["license"]) if pd.notna(row["license"]) else None
        copyright_val = str(row["copyright"]) if pd.notna(row["copyright"]) else None

        # Values that still have FIXME markers (not yet corrected) count as missing.
        # The row itself is kept, so that its other values are still used.
        if is_fixme_value(license_val):
            license_val = None
        if is_fixme_value(copyright_val):
            copyright_val = None

        # Collect all authorship columns (authorship_1, authorship_2, etc.)
        authorships = _collect_authorships_from_row(row, df.columns)

        corrections[res_id] = LegalMetadata(
            license=license_val,
            copyright=copyright_val,
            authorships=authorships,
        )

    return corrections
105
+
106
+
107
def _collect_authorships_from_row(row: pd.Series, df_columns: pd.Index) -> Authorships:
    """
    Collect all authorship values from a CSV row.

    The columns authorship_1, authorship_2, ... are read in ascending order,
    stopping at the first number for which no column exists.
    Empty cells and FIXME markers are left out.

    Args:
        row: One row of the corrections CSV
        df_columns: The column names of the corrections CSV

    Returns:
        Authorships built from the remaining values
    """
    collected = []
    col_number = 1
    while (col_name := f"authorship_{col_number}") in df_columns:
        col_number += 1
        cell = row[col_name]
        if pd.isna(cell):
            continue
        text = str(cell)
        if is_fixme_value(text):
            continue
        collected.append(text)
    return Authorships.from_iterable(collected)
124
+
125
+
126
+ def is_fixme_value(value: str | None) -> bool:
127
+ """Check if a value is a FIXME marker"""
128
+ return value is not None and value.startswith("FIXME:")
129
+
130
+
131
def write_problems_to_csv(input_file: Path, problems: list[Problem]) -> None:
    """
    Save the problems as CSV and tell the user on stdout how to proceed.

    Args:
        input_file: The input XML file path, used to determine the path of the CSV
        problems: The problems to write into the CSV
    """
    csv_path = ProblemAggregator(problems).save_to_csv(input_file)
    print(f"\n⚠️ Legal metadata contains errors. Please fix them in the CSV file:\n {csv_path}")
    print(f"\nAfter fixing the errors, rerun the command with:\n --fixed_errors={csv_path}")