ds-caselaw-marklogic-api-client 28.0.0__tar.gz → 28.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ds-caselaw-marklogic-api-client might be problematic. Click here for more details.

Files changed (83) hide show
  1. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/PKG-INFO +3 -2
  2. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/pyproject.toml +3 -2
  3. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/Client.py +48 -0
  4. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/factories.py +1 -0
  5. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/identifier_resolution.py +43 -0
  6. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/documents/__init__.py +21 -0
  7. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/documents/xml.py +1 -1
  8. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/models/identifiers/__init__.py +151 -0
  9. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/models/identifiers/fclid.py +48 -0
  10. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/models/identifiers/neutral_citation.py +49 -0
  11. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/models/identifiers/unpacker.py +46 -0
  12. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/utilities/aws.py +2 -0
  13. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xml_helpers.py +2 -4
  14. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/xquery/get_next_document_sequence_number.xqy +14 -0
  15. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/xquery/get_property_as_node.xqy +9 -0
  16. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/xquery/resolve_from_identifier.xqy +17 -0
  17. ds_caselaw_marklogic_api_client-28.2.0/src/caselawclient/xquery/set_property_as_node.xqy +11 -0
  18. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery_type_dicts.py +19 -0
  19. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/LICENSE.md +0 -0
  20. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/README.md +0 -0
  21. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/__init__.py +0 -0
  22. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/client_helpers/__init__.py +0 -0
  23. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/client_helpers/search_helpers.py +0 -0
  24. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/content_hash.py +0 -0
  25. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/errors.py +0 -0
  26. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/__init__.py +0 -0
  27. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/documents/body.py +0 -0
  28. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/documents/exceptions.py +0 -0
  29. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/documents/statuses.py +0 -0
  30. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/documents/transforms/html.xsl +0 -0
  31. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/judgments.py +0 -0
  32. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/neutral_citation_mixin.py +0 -0
  33. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/press_summaries.py +0 -0
  34. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/utilities/__init__.py +0 -0
  35. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/utilities/dates.py +0 -0
  36. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/models/utilities/move.py +0 -0
  37. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/py.typed +0 -0
  38. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/responses/__init__.py +0 -0
  39. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/responses/search_response.py +0 -0
  40. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/responses/search_result.py +0 -0
  41. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/responses/xsl/search_match.xsl +0 -0
  42. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/search_parameters.py +0 -0
  43. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/break_judgment_checkout.xqy +0 -0
  44. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/checkin_judgment.xqy +0 -0
  45. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/checkout_judgment.xqy +0 -0
  46. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/copy_document.xqy +0 -0
  47. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/delete_judgment.xqy +0 -0
  48. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/document_collections.xqy +0 -0
  49. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/document_exists.xqy +0 -0
  50. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_combined_stats_table.xqy +0 -0
  51. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_components_for_document.xqy +0 -0
  52. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_highest_enrichment_version.xqy +0 -0
  53. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_highest_parser_version.xqy +0 -0
  54. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_judgment.xqy +0 -0
  55. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_judgment_checkout_status.xqy +0 -0
  56. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_judgment_version.xqy +0 -0
  57. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_last_modified.xqy +0 -0
  58. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy +0 -0
  59. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_pending_parse_for_version.xqy +0 -0
  60. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_properties_for_search_results.xqy +0 -0
  61. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_property.xqy +0 -0
  62. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_recently_enriched.xqy +0 -0
  63. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_recently_parsed.xqy +0 -0
  64. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_version_annotation.xqy +0 -0
  65. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/get_version_created.xqy +0 -0
  66. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/insert_document.xqy +0 -0
  67. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/list_judgment_versions.xqy +0 -0
  68. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_boolean_property.xqy +0 -0
  69. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_metadata_citation.xqy +0 -0
  70. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_metadata_court.xqy +0 -0
  71. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_metadata_jurisdiction.xqy +0 -0
  72. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_metadata_name.xqy +0 -0
  73. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_metadata_this_uri.xqy +0 -0
  74. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_metadata_work_expression_date.xqy +0 -0
  75. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/set_property.xqy +0 -0
  76. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/update_document.xqy +0 -0
  77. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/update_locked_judgment.xqy +0 -0
  78. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/user_has_privilege.xqy +0 -0
  79. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/user_has_role.xqy +0 -0
  80. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/validate_all_documents.xqy +0 -0
  81. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/validate_document.xqy +0 -0
  82. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/xslt.xqy +0 -0
  83. {ds_caselaw_marklogic_api_client-28.0.0 → ds_caselaw_marklogic_api_client-28.2.0}/src/caselawclient/xquery/xslt_transform.xqy +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ds-caselaw-marklogic-api-client
3
- Version: 28.0.0
3
+ Version: 28.2.0
4
4
  Summary: An API client for interacting with the underlying data in Find Caselaw.
5
5
  Home-page: https://github.com/nationalarchives/ds-caselaw-custom-api-client
6
6
  Keywords: national archives,caselaw
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.9
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Requires-Dist: boto3 (>=1.26.112,<2.0.0)
14
- Requires-Dist: certifi (>=2024.8.30,<2024.9.0)
14
+ Requires-Dist: certifi (>=2024.12.14,<2024.13.0)
15
15
  Requires-Dist: charset-normalizer (>=3.0.0,<4.0.0)
16
16
  Requires-Dist: django-environ (>=0.11.0,<0.12.0)
17
17
  Requires-Dist: ds-caselaw-utils (>=2.0.0,<3.0.0)
@@ -25,6 +25,7 @@ Requires-Dist: pytz (>=2024.1,<2025.0)
25
25
  Requires-Dist: requests (>=2.28.2,<3.0.0)
26
26
  Requires-Dist: requests-toolbelt (>=0.10.1,<1.1.0)
27
27
  Requires-Dist: saxonche (>=12.5.0,<13.0.0)
28
+ Requires-Dist: sqids (>=0.5.0,<0.6.0)
28
29
  Requires-Dist: typing-extensions (>=4.7.1,<5.0.0)
29
30
  Description-Content-Type: text/markdown
30
31
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "ds-caselaw-marklogic-api-client"
3
- version = "28.0.0"
3
+ version = "28.2.0"
4
4
  description = "An API client for interacting with the underlying data in Find Caselaw."
5
5
  authors = ["The National Archives"]
6
6
  homepage = "https://github.com/nationalarchives/ds-caselaw-custom-api-client"
@@ -12,7 +12,7 @@ packages = [
12
12
 
13
13
  [tool.poetry.dependencies]
14
14
  python = "^3.9"
15
- certifi = ">=2024.8.30,<2024.9.0"
15
+ certifi = ">=2024.12.14,<2024.13.0"
16
16
  charset-normalizer = "^3.0.0"
17
17
  django-environ = "^0.11.0"
18
18
  idna = "^3.4"
@@ -28,6 +28,7 @@ mypy-boto3-sns = "^1.26.69"
28
28
  pytz = "^2024.1"
29
29
  python-dateutil = "^2.9.0-post.0"
30
30
  saxonche = "^12.5.0"
31
+ sqids = "^0.5.0"
31
32
 
32
33
  [tool.poetry.group.dev.dependencies]
33
34
  coverage = "^7.2.3"
@@ -13,12 +13,14 @@ from xml.etree.ElementTree import Element, ParseError, fromstring
13
13
  import environ
14
14
  import requests
15
15
  from ds_caselaw_utils.types import NeutralCitationString
16
+ from lxml import etree
16
17
  from requests.auth import HTTPBasicAuth
17
18
  from requests.structures import CaseInsensitiveDict
18
19
  from requests_toolbelt.multipart import decoder
19
20
 
20
21
  from caselawclient import xquery_type_dicts as query_dicts
21
22
  from caselawclient.client_helpers import VersionAnnotation
23
+ from caselawclient.identifier_resolution import IdentifierResolutions
22
24
  from caselawclient.models.documents import (
23
25
  DOCUMENT_COLLECTION_URI_JUDGMENT,
24
26
  DOCUMENT_COLLECTION_URI_PRESS_SUMMARY,
@@ -864,6 +866,17 @@ class MarklogicApiClient:
864
866
  }
865
867
  return self._eval_and_decode(vars, "get_property.xqy")
866
868
 
869
def get_property_as_node(self, judgment_uri: DocumentURIString, name: str) -> Optional[etree._Element]:
    """
    Read a named MarkLogic property of a document and parse it as XML.

    :param judgment_uri: URI of the document whose property should be read
    :param name: Name of the property to fetch

    :return: The root element parsed from the query result, or `None` if the
        query returned an empty (falsy) value
    """
    query_vars: query_dicts.GetPropertyAsNodeDict = {
        "uri": self._format_uri_for_marklogic(judgment_uri),
        "name": name,
    }
    raw_property = self._eval_and_decode(query_vars, "get_property_as_node.xqy")
    # An empty result means there is nothing to parse.
    return etree.fromstring(raw_property) if raw_property else None
879
+
867
880
  def get_version_annotation(self, judgment_uri: DocumentURIString) -> str:
868
881
  uri = self._format_uri_for_marklogic(judgment_uri)
869
882
  vars: query_dicts.GetVersionAnnotationDict = {
@@ -896,6 +909,22 @@ class MarklogicApiClient:
896
909
 
897
910
  return self._send_to_eval(vars, "set_property.xqy")
898
911
 
912
def set_property_as_node(
    self,
    judgment_uri: DocumentURIString,
    name: str,
    value: etree._Element,
) -> requests.Response:
    """
    Store the _contents_ of an XML root node as a MarkLogic property of a document.

    The root node passed in is only a wrapper: it is discarded and its children
    become the value of the property.

    :param judgment_uri: URI of the document whose property should be set
    :param name: Name of the property to set
    :param value: Root element wrapping the nodes to store

    :return: The raw response from the MarkLogic eval call
    """
    marklogic_uri = self._format_uri_for_marklogic(judgment_uri)
    # The element is sent to MarkLogic as a serialised XML string.
    serialised_value = etree.tostring(value).decode()
    query_vars: query_dicts.SetPropertyAsNodeDict = {
        "uri": marklogic_uri,
        "value": serialised_value,
        "name": name,
    }
    return self._send_to_eval(query_vars, "set_property_as_node.xqy")
927
+
899
928
  def set_boolean_property(
900
929
  self,
901
930
  judgment_uri: DocumentURIString,
@@ -1173,3 +1202,22 @@ class MarklogicApiClient:
1173
1202
  )
1174
1203
 
1175
1204
  return results
1205
+
1206
def resolve_from_identifier(self, identifier_uri: str, published_only: bool = True) -> IdentifierResolutions:
    """
    Look up the MarkLogic documents whose precomputed slug matches a PUI/EUI url.

    :param identifier_uri: The PUI/EUI url to resolve
    :param published_only: Passed to the query as an integer flag (1 or 0)

    :return: The candidate resolutions; callers should anticipate more than one
    """
    query_vars: query_dicts.ResolveFromIdentifierDict = {
        "identifier_uri": DocumentURIString(identifier_uri),
        "published_only": int(published_only),
    }
    response = self._send_to_eval(query_vars, "resolve_from_identifier.xqy")
    # Each part of the multipart response is one serialised row.
    rows: list[str] = get_multipart_strings_from_marklogic_response(response)
    return IdentifierResolutions.from_marklogic_output(rows)
1220
+
1221
def get_next_document_sequence_number(self) -> int:
    """
    Increment the MarkLogic sequence number by one and return the new value.

    :return: The decoded result of `get_next_document_sequence_number.xqy`, as an integer
    """
    raw_sequence_number = self._eval_and_decode({}, "get_next_document_sequence_number.xqy")
    return int(raw_sequence_number)
@@ -62,6 +62,7 @@ class DocumentFactory:
62
62
  if not api_client:
63
63
  api_client = Mock(spec=MarklogicApiClient)
64
64
  api_client.get_judgment_xml_bytestring.return_value = DEFAULT_DOCUMENT_BODY_XML.encode(encoding="utf-8")
65
+ api_client.get_property_as_node.return_value = None
65
66
 
66
67
  document = cls.target_class(uri, api_client=api_client)
67
68
  document.content_as_html = Mock(return_value=html) # type: ignore[method-assign]
@@ -0,0 +1,43 @@
1
+ import json
2
+ from typing import NamedTuple
3
+
4
+ from caselawclient.models.documents import DocumentURIString
5
+ from caselawclient.xquery_type_dicts import MarkLogicDocumentURIString
6
+
7
+
8
class IdentifierResolutions(list["IdentifierResolution"]):
    """
    The candidate MarkLogic documents which correspond to a Public UI uri.

    MarkLogic returns one serialised dictionary per identifier-to-document
    mapping; each of those is handled by `IdentifierResolution`, and this class
    is simply the list of them.

    see `xquery/resolve_from_identifier.xqy` and `resolve_from_identifier` in `Client.py`
    """

    @staticmethod
    def from_marklogic_output(table: list[str]) -> "IdentifierResolutions":
        """Build the list by parsing every raw row returned from MarkLogic."""
        return IdentifierResolutions([IdentifierResolution.from_marklogic_output(raw_row) for raw_row in table])

    def published(self) -> "IdentifierResolutions":
        """Return a new list holding only the resolutions whose document is published."""
        return IdentifierResolutions([resolution for resolution in self if resolution.document_published])
25
+
26
+
27
class IdentifierResolution(NamedTuple):
    """One identifier / document mapping, as reported by MarkLogic."""

    identifier_uuid: str
    document_uri: MarkLogicDocumentURIString
    identifier_slug: DocumentURIString
    document_published: bool

    @staticmethod
    def from_marklogic_output(raw_row: str) -> "IdentifierResolution":
        """
        Parse a single JSON-encoded row into an `IdentifierResolution`.

        :param raw_row: JSON object keyed by `documents.compiled_url_slugs.*` column names

        :raises KeyError: If any of the expected columns is absent from the row
        """
        decoded = json.loads(raw_row)
        uuid = decoded["documents.compiled_url_slugs.identifier_uuid"]
        marklogic_uri = MarkLogicDocumentURIString(decoded["documents.compiled_url_slugs.document_uri"])
        slug = DocumentURIString(decoded["documents.compiled_url_slugs.identifier_slug"])
        # The published flag is compared against the string "true", not a JSON boolean.
        is_published = decoded["documents.compiled_url_slugs.document_published"] == "true"
        return IdentifierResolution(uuid, marklogic_uri, slug, is_published)
@@ -15,6 +15,8 @@ from caselawclient.errors import (
15
15
  NotSupportedOnVersion,
16
16
  OnlySupportedOnVersion,
17
17
  )
18
+ from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema
19
+ from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree
18
20
  from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
19
21
  from caselawclient.models.utilities.aws import (
20
22
  ParserInstructionsDict,
@@ -146,6 +148,8 @@ class Document:
146
148
  )
147
149
  """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """
148
150
 
151
+ self._initialise_identifiers()
152
+
149
153
  def __repr__(self) -> str:
150
154
  name = self.body.name or "un-named"
151
155
  return f"<{self.document_noun} {self.uri}: {name}>"
@@ -160,6 +164,12 @@ class Document:
160
164
  """There is a docx in S3 private bucket for this Document"""
161
165
  return check_docx_exists(self.uri)
162
166
 
167
def _initialise_identifiers(self) -> None:
    """Populate `self.identifiers` from this document's "identifiers" property in MarkLogic."""
    stored_identifiers = self.api_client.get_property_as_node(self.uri, "identifiers")
    self.identifiers = unpack_all_identifiers_from_etree(stored_identifiers)
172
+
163
173
  @property
164
174
  def best_human_identifier(self) -> Optional[str]:
165
175
  """
@@ -423,6 +433,12 @@ class Document:
423
433
  if not self.is_publishable:
424
434
  raise CannotPublishUnpublishableDocument
425
435
 
436
+ ## If it doesn't already have one, get a new FCLID for this document and assign
437
+ if len(self.identifiers.of_type(FindCaseLawIdentifier)) < 1:
438
+ document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client)
439
+ self.identifiers.add(document_fclid)
440
+ self.save_identifiers()
441
+
426
442
  publish_documents(uri_for_s3(self.uri))
427
443
  self.api_client.set_published(self.uri, True)
428
444
  announce_document_event(
@@ -521,6 +537,11 @@ class Document:
521
537
  """
522
538
  return self.docx_exists()
523
539
 
540
def save_identifiers(self) -> None:
    """
    Write this Document's identifiers back to MarkLogic.

    The identifiers are validated first, so nothing is sent if validation raises.
    """
    current_identifiers = self.identifiers
    current_identifiers.validate()
    self.api_client.set_property_as_node(self.uri, "identifiers", current_identifiers.as_etree)
544
+
524
545
  def __getattr__(self, name: str) -> Any:
525
546
  warnings.warn(f"{name} no longer exists on Document, using Document.body instead", DeprecationWarning)
526
547
  try:
@@ -17,7 +17,7 @@ class XML:
17
17
  :raises NonXMLDocumentError: This document is not valid XML
18
18
  """
19
19
  try:
20
- self.xml_as_tree: etree.Element = etree.fromstring(xml_bytestring)
20
+ self.xml_as_tree: etree._Element = etree.fromstring(xml_bytestring)
21
21
  except etree.XMLSyntaxError:
22
22
  raise NonXMLDocumentError
23
23
 
@@ -0,0 +1,151 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Optional, Union
3
+ from uuid import uuid4
4
+
5
+ from lxml import etree
6
+
7
# Attributes of an Identifier which `Identifier.as_xml_tree` writes out as child
# elements of the packed `<identifier>` representation.
IDENTIFIER_PACKABLE_ATTRIBUTES: list[str] = [
    "uuid",
    "value",
    "url_slug",
]

# Child elements which are read back from a packed `<identifier>` and passed to the
# Identifier constructor. `url_slug` is absent here: on Identifier it is a property
# computed from the value via the schema, not a constructor argument.
IDENTIFIER_UNPACKABLE_ATTRIBUTES: list[str] = [
    "uuid",
    "value",
]
17
+
18
+
19
class InvalidIdentifierXMLRepresentationException(Exception):
    """A packed identifier's XML is missing a required element, or that element is empty."""
21
+
22
+
23
class UUIDMismatchError(Exception):
    """An identifier is stored in an `Identifiers` collection under a key which is not its own UUID."""
25
+
26
+
27
class IdentifierSchema(ABC):
    """
    Abstract description of an identifier schema.

    Concrete schemas must set a non-empty `name` and `namespace`, and implement
    both classmethods below.
    """

    name: str
    namespace: str

    def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None:
        """Refuse to define a subclass which has not set the required class attributes."""
        unset = [required for required in ("name", "namespace") if not getattr(cls, required, False)]
        if unset:
            # Report the first missing attribute, in declaration order.
            required = unset[0]
            raise NotImplementedError(f"Can't instantiate IdentifierSchema without {required} attribute.")
        super().__init_subclass__(**kwargs)

    def __repr__(self) -> str:
        return self.name

    @classmethod
    @abstractmethod
    def validate_identifier(cls, value: str) -> bool:
        """Return whether `value` is a valid identifier under this schema."""

    @classmethod
    @abstractmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """Turn an identifier value into its precompiled URL slug."""
59
+
60
+
61
class Identifier(ABC):
    """
    Base class for a concrete identifier: a value, a UUID, and the schema it belongs to.

    Subclasses must set the `schema` class attribute.
    """

    schema: type[IdentifierSchema]

    uuid: str
    value: str

    def __init_subclass__(cls: type["Identifier"], **kwargs: Any) -> None:
        """Refuse to define a subclass which has not set `schema`."""
        required = "schema"
        if not getattr(cls, required, False):
            raise NotImplementedError(f"Can't instantiate Identifier without {required} attribute.")
        super().__init_subclass__(**kwargs)

    def __repr__(self) -> str:
        return f"<{self.schema.name} {self.value}: {self.uuid}>"

    def __init__(self, value: str, uuid: Optional[str] = None) -> None:
        """
        :param value: The identifier's value
        :param uuid: An existing UUID to reuse; if omitted (or empty) a fresh one
            of the form `id-<uuid4>` is generated
        """
        self.value = value
        self.uuid = uuid if uuid else "id-" + str(uuid4())

    @property
    def as_xml_tree(self) -> etree._Element:
        """Pack this Identifier into an `<identifier>` element for storage."""
        packed = etree.Element("identifier")

        namespace_element = etree.SubElement(packed, "namespace")
        namespace_element.text = self.schema.namespace

        # One child element per packable attribute, named after the attribute.
        for attribute_name in IDENTIFIER_PACKABLE_ATTRIBUTES:
            child = etree.SubElement(packed, attribute_name)
            child.text = getattr(self, attribute_name)

        return packed

    @property
    def url_slug(self) -> str:
        """The URL slug for this identifier's value, as compiled by its schema."""
        return self.schema.compile_identifier_url_slug(self.value)

    def same_as(self, other: "Identifier") -> bool:
        """Is this the same as another identifier (in value and schema)? UUIDs are not compared."""
        if self.value != other.value:
            return False
        return self.schema == other.schema
107
+
108
+
109
class Identifiers(dict[str, Identifier]):
    """A collection of identifiers, keyed by each identifier's UUID."""

    def validate(self) -> None:
        """
        Check that every identifier is stored under its own UUID.

        :raises UUIDMismatchError: If any key differs from the UUID of the identifier it maps to
        """
        for uuid, identifier in self.items():
            if uuid != identifier.uuid:
                # This must be an f-string, otherwise the placeholders are emitted literally.
                msg = f"Key of {identifier} in Identifiers is {uuid} not {identifier.uuid}"
                raise UUIDMismatchError(msg)

    def contains(self, other_identifier: Identifier) -> bool:
        "Do the identifier's value and namespace already exist in this group?"
        return any(other_identifier.same_as(identifier) for identifier in self.values())

    def add(self, identifier: Identifier) -> None:
        """Add an identifier under its UUID, unless an equivalent one (same value and schema) is already present."""
        if not self.contains(identifier):
            self[identifier.uuid] = identifier

    def __delitem__(self, key: Union[Identifier, str]) -> None:
        """Delete by UUID string, or by passing the Identifier itself (its UUID is used as the key)."""
        if isinstance(key, Identifier):
            super().__delitem__(key.uuid)
        else:
            super().__delitem__(key)

    def of_type(self, identifier_type: type[Identifier]) -> list[Identifier]:
        """Return a list of all identifiers of a given type (including its subclasses)."""
        return [identifier for identifier in self.values() if isinstance(identifier, identifier_type)]

    def delete_type(self, deleted_identifier_type: type[Identifier]) -> None:
        "For when we want an identifier to be the only valid identifier of that type, delete the others first"
        # Iterate over a snapshot of the keys, because entries are deleted during the loop.
        for uuid in list(self.keys()):
            # we could use compare to .schema instead, which would have different behaviour for subclasses
            if isinstance(self[uuid], deleted_identifier_type):
                del self[uuid]

    @property
    def as_etree(self) -> etree._Element:
        """Return an `<identifiers>` element containing the packed form of every identifier."""
        identifiers_root = etree.Element("identifiers")

        for identifier in self.values():
            identifiers_root.append(identifier.as_xml_tree)

        return identifiers_root
@@ -0,0 +1,48 @@
1
+ import re
2
+ from typing import TYPE_CHECKING
3
+
4
+ from sqids import Sqids
5
+
6
+ from . import Identifier, IdentifierSchema
7
+
8
+ if TYPE_CHECKING:
9
+ from caselawclient.Client import MarklogicApiClient
10
+
11
+
12
# A syntactically valid FCLID value: four or more characters drawn from the
# lowercase alphabet below.
# NOTE(review): the character class repeats FCLID_ALPHABET by hand, and the minimum
# of 4 here differs from FCLID_MINIMUM_LENGTH (8) used when minting — confirm that
# accepting shorter values is intentional.
VALID_FCLID_PATTERN = re.compile(r"^[bcdfghjkmnpqrstvwxyz23456789]{4,}$")

# Minimum length and alphabet handed to Sqids when encoding new identifiers.
FCLID_MINIMUM_LENGTH = 8
FCLID_ALPHABET = "bcdfghjkmnpqrstvwxyz23456789"

# Module-level encoder shared by every call to `FindCaseLawIdentifierSchema.mint`.
sqids = Sqids(
    min_length=FCLID_MINIMUM_LENGTH,
    alphabet=FCLID_ALPHABET,
)
21
+
22
+
23
class FindCaseLawIdentifierSchema(IdentifierSchema):
    """
    The schema for Find Case Law Identifiers (FCLIDs).
    """

    name = "Find Case Law Identifier"
    namespace = "fclid"

    @classmethod
    def validate_identifier(cls, value: str) -> bool:
        """Return whether `value` matches `VALID_FCLID_PATTERN`."""
        return VALID_FCLID_PATTERN.match(value) is not None

    @classmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """Return the identifier's URL slug: the value prefixed with `tna.`."""
        slug_prefix = "tna."
        return slug_prefix + value

    @classmethod
    def mint(cls, api_client: "MarklogicApiClient") -> "FindCaseLawIdentifier":
        """
        Create a brand new Find Case Law identifier.

        The next document sequence number is requested from MarkLogic and encoded
        with the module-level Sqids encoder to produce the identifier's value.
        """
        sequence_number = api_client.get_next_document_sequence_number()
        return FindCaseLawIdentifier(value=sqids.encode([sequence_number]))
45
+
46
+
47
class FindCaseLawIdentifier(Identifier):
    """An identifier belonging to `FindCaseLawIdentifierSchema`."""

    schema = FindCaseLawIdentifierSchema
@@ -0,0 +1,49 @@
1
+ import re
2
+
3
+ from ds_caselaw_utils import neutral_url
4
+ from ds_caselaw_utils.types import NeutralCitationString
5
+
6
+ from . import Identifier, IdentifierSchema
7
+
8
# NOTE(review): the whole expression is wrapped in an outer capturing group, so a match
# actually exposes six groups: `match.groups()[0]` is the entire citation and the year is
# at index 1. The indices listed in the description below are therefore off by one —
# confirm and correct the description (or drop the outer group) before relying on them.
VALID_NCN_PATTERN = re.compile(r"(^\[([0-9]{4})\] ([a-zA-Z]+)(?: ([a-zA-Z]+))? ([0-9]+)(?: \(([a-zA-Z]+)\))?$)")
"""
This is a catch-all pattern for anything which looks like a Neutral Citation, even if the court itself isn't valid. Checking that an NCN is plausibly correct is handled elsewhere.

This pattern also defines five capture groups to standardise how we interface with the elements:

- `0`: The year of the decision
- `1`: The court
- `2`: (Optionally) the jurisdiction or division, depending on the court
- `3`: The sequence number of the decision
- `4`: (Optionally) the jurisdiction or division, depending on the court

TODO: When these capture groups are being used in anger (eg to build URL slugs) you should go through and name the groups.
"""
22
+
23
+
24
class NeutralCitationNumberSchema(IdentifierSchema):
    """
    The schema for Neutral Citation Numbers (NCNs).

    https://www.iclr.co.uk/knowledge/case-law/neutral-citations/
    """

    name = "Neutral Citation Number"
    namespace = "ukncn"

    @classmethod
    def validate_identifier(cls, value: str) -> bool:
        """Return whether `value` matches `VALID_NCN_PATTERN`."""
        return VALID_NCN_PATTERN.match(value) is not None

    @classmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """
        Convert an NCN into its NCN-based URL slug using `neutral_url`.

        :raises Exception: If `neutral_url` returns nothing for this value
        """
        # TODO: At some point this should move out of utils and into this class.
        slug = neutral_url(NeutralCitationString(value))
        if slug:
            return slug
        raise Exception(f"Unable to convert NCN {value} into NCN-based URL slug")
46
+
47
+
48
class NeutralCitationNumber(Identifier):
    """An identifier belonging to `NeutralCitationNumberSchema`."""

    schema = NeutralCitationNumberSchema
@@ -0,0 +1,46 @@
1
+ from typing import Optional
2
+
3
+ from lxml import etree
4
+
5
+ from . import IDENTIFIER_UNPACKABLE_ATTRIBUTES, Identifier, Identifiers, InvalidIdentifierXMLRepresentationException
6
+ from .fclid import FindCaseLawIdentifier
7
+ from .neutral_citation import NeutralCitationNumber
8
+
9
# Maps the text of a packed identifier's <namespace> element to the Identifier
# subclass used to unpack it. The keys match the `namespace` values declared on
# the corresponding schemas.
IDENTIFIER_NAMESPACE_MAP: dict[str, type[Identifier]] = {
    "fclid": FindCaseLawIdentifier,
    "ukncn": NeutralCitationNumber,
}
13
+
14
+
15
def unpack_all_identifiers_from_etree(identifiers_etree: Optional[etree._Element]) -> Identifiers:
    """
    Unpack every `<identifier>` directly inside an `<identifiers>` element.

    :param identifiers_etree: The entire `<identifiers>` element, or `None` if there is none

    :return: The unpacked identifiers; empty if `identifiers_etree` is `None`
    """
    unpacked = Identifiers()
    if identifiers_etree is not None:
        for identifier_element in identifiers_etree.findall("identifier"):
            unpacked.add(unpack_an_identifier_from_etree(identifier_element))
    return unpacked
24
+
25
+
26
def unpack_an_identifier_from_etree(identifier_xml: etree._Element) -> Identifier:
    """
    Given an etree representation of a single identifier, unpack it into an appropriate instance of an Identifier.

    :param identifier_xml: A single packed `<identifier>` element

    :return: An instance of the Identifier subclass registered for the element's namespace

    :raises InvalidIdentifierXMLRepresentationException: If the namespace, or any attribute
        listed in `IDENTIFIER_UNPACKABLE_ATTRIBUTES`, is absent or empty
    :raises KeyError: If the namespace is not present in `IDENTIFIER_NAMESPACE_MAP`
    """

    namespace_element = identifier_xml.find("namespace")

    if namespace_element is None or not namespace_element.text:
        raise InvalidIdentifierXMLRepresentationException(
            "Identifer XML representation is not valid: namespace not present or empty"
        )

    kwargs: dict[str, str] = {}

    for attribute in IDENTIFIER_UNPACKABLE_ATTRIBUTES:
        element = identifier_xml.find(attribute)
        if element is None or not element.text:
            # Name the attribute which is missing; `element` itself is None or an
            # empty element here, so interpolating it tells the reader nothing.
            raise InvalidIdentifierXMLRepresentationException(
                f"Identifer XML representation is not valid: {attribute} not present or empty"
            )
        kwargs[attribute] = element.text

    return IDENTIFIER_NAMESPACE_MAP[namespace_element.text](**kwargs)
@@ -137,12 +137,14 @@ def publish_documents(uri: str) -> None:
137
137
  response = client.list_objects(Bucket=private_bucket, Prefix=uri)
138
138
 
139
139
  for result in response.get("Contents", []):
140
+ print(f"Contemplating copying {result!r}")
140
141
  key = str(result["Key"])
141
142
 
142
143
  if not key.endswith("parser.log") and not key.endswith(".tar.gz"):
143
144
  source: CopySourceTypeDef = {"Bucket": private_bucket, "Key": key}
144
145
  extra_args: dict[str, str] = {}
145
146
  try:
147
+ print(f"Copying {key!r} from {private_bucket!r} to {public_bucket!r}")
146
148
  client.copy(source, public_bucket, key, extra_args)
147
149
  except botocore.client.ClientError as e:
148
150
  logging.warning(
@@ -9,8 +9,7 @@ def get_xpath_match_string(
9
9
  namespaces: Optional[Dict[str, str]] = None,
10
10
  fallback: str = "",
11
11
  ) -> str:
12
- kwargs = {"namespaces": namespaces} if namespaces else {}
13
- return str((node.xpath(path, **kwargs) or [fallback])[0])
12
+ return str((node.xpath(path, namespaces=namespaces) or [fallback])[0])
14
13
 
15
14
 
16
15
  def get_xpath_match_strings(
@@ -18,5 +17,4 @@ def get_xpath_match_strings(
18
17
  path: str,
19
18
  namespaces: Optional[Dict[str, str]] = None,
20
19
  ) -> list[str]:
21
- kwargs = {"namespaces": namespaces} if namespaces else {}
22
- return [str(x) for x in node.xpath(path, **kwargs)]
20
+ return [str(x) for x in node.xpath(path, namespaces=namespaces)]
@@ -0,0 +1,14 @@
1
xquery version "1.0-ml";
declare option xdmp:transaction-mode "update";

(: Increment the document_counter element held in state.xml and return the new value. :)

let $_ := xdmp:set-transaction-mode("update")
let $counter := fn:doc("state.xml")/state/document_counter

(: fn:sum over the existing counter text and 1 yields the incremented value :)
let $incremented := fn:sum(($counter/text(), 1))

let $_ := xdmp:node-replace($counter, <document_counter>{$incremented}</document_counter>)
let $_ := xdmp:commit()

return $incremented
@@ -0,0 +1,9 @@
1
xquery version "1.0-ml";

(: Return the property named $name (no namespace) of the document at $uri. :)

declare variable $uri as xs:string external;
declare variable $name as xs:string external;

xdmp:document-get-properties($uri, fn:QName("", $name))
@@ -0,0 +1,17 @@
1
xquery version "1.0-ml";

(: Look up rows of compiled_url_slugs whose identifier_slug equals $identifier_uri,
   restricted to published documents when $published_only is truthy. :)

declare namespace xdmp="http://marklogic.com/xdmp";
declare variable $identifier_uri as xs:string external;
declare variable $published_only as xs:int? external := 1;

(: The slug is supplied through the @uri binding rather than concatenated into the SQL text :)
let $base_query := "SELECT * from compiled_url_slugs WHERE (identifier_slug = @uri)"
let $published_clause :=
  if ($published_only)
  then " AND document_published = 'true'"
  else ""
let $bindings := map:new((map:entry("uri", $identifier_uri)))

return xdmp:sql(fn:concat($base_query, $published_clause), "map", $bindings)
@@ -0,0 +1,11 @@
1
xquery version "1.0-ml";

(: Set a property named $name on the document at $uri, with content taken from the XML in $value. :)

import module namespace dls = "http://marklogic.com/xdmp/dls" at "/MarkLogic/dls.xqy";

declare variable $uri as xs:string external;
declare variable $value as xs:string external;
declare variable $name as xs:string external;

(: Parse $value, then wrap the children of its root element in a new element named $name :)
let $parsed := xdmp:unquote($value)
let $property := element {$name} {$parsed/*/*}

return dls:document-set-property($uri, $property)
@@ -113,6 +113,12 @@ class GetPropertyDict(MarkLogicAPIDict):
113
113
  uri: MarkLogicDocumentURIString
114
114
 
115
115
 
116
+ # get_property_as_node.xqy
117
class GetPropertyAsNodeDict(MarkLogicAPIDict):
    """Variables passed to get_property_as_node.xqy."""

    # Name of the property to fetch.
    name: str
    # URI of the document whose property is read.
    uri: MarkLogicDocumentURIString
120
+
121
+
116
122
  # get_version_annotation.xqy
117
123
  class GetVersionAnnotationDict(MarkLogicAPIDict):
118
124
  uri: MarkLogicDocumentURIString
@@ -135,6 +141,12 @@ class ListJudgmentVersionsDict(MarkLogicAPIDict):
135
141
  uri: MarkLogicDocumentURIString
136
142
 
137
143
 
144
+ # resolve_from_identifier.xqy
145
class ResolveFromIdentifierDict(MarkLogicAPIDict):
    """Variables passed to resolve_from_identifier.xqy."""

    # Value matched against identifier_slug in the query.
    identifier_uri: DocumentURIString
    # Integer flag; the script applies the published-only filter when it is truthy.
    published_only: Optional[int]
148
+
149
+
138
150
  # set_boolean_property.xqy
139
151
  class SetBooleanPropertyDict(MarkLogicAPIDict):
140
152
  name: str
@@ -187,6 +199,13 @@ class SetPropertyDict(MarkLogicAPIDict):
187
199
  value: str
188
200
 
189
201
 
202
+ # set_property_as_node.xqy
203
class SetPropertyAsNodeDict(MarkLogicAPIDict):
    """Variables passed to set_property_as_node.xqy."""

    # Name of the property element to set.
    name: str
    # URI of the document whose property is written.
    uri: MarkLogicDocumentURIString
    # XML text that the script parses to build the property content.
    value: str
207
+
208
+
190
209
  # update_document.xqy
191
210
  class UpdateDocumentDict(MarkLogicAPIDict):
192
211
  annotation: str