ds-caselaw-marklogic-api-client 24.0.1__tar.gz → 26.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ds-caselaw-marklogic-api-client might be problematic. Click here for more details.
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/PKG-INFO +2 -2
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/pyproject.toml +18 -6
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/Client.py +28 -13
- ds_caselaw_marklogic_api_client-24.0.1/src/caselawclient/models/documents.py → ds_caselaw_marklogic_api_client-26.0.0/src/caselawclient/models/documents/__init__.py +27 -209
- ds_caselaw_marklogic_api_client-26.0.0/src/caselawclient/models/documents/body.py +142 -0
- ds_caselaw_marklogic_api_client-26.0.0/src/caselawclient/models/documents/exceptions.py +6 -0
- ds_caselaw_marklogic_api_client-26.0.0/src/caselawclient/models/documents/statuses.py +12 -0
- ds_caselaw_marklogic_api_client-26.0.0/src/caselawclient/models/documents/xml.py +43 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/judgments.py +1 -3
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/press_summaries.py +1 -3
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/utilities/aws.py +3 -2
- ds_caselaw_marklogic_api_client-24.0.1/src/caselawclient/xml_tools.py +0 -129
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/LICENSE.md +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/README.md +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/__init__.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/client_helpers/__init__.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/client_helpers/search_helpers.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/content_hash.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/errors.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/__init__.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/neutral_citation_mixin.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/utilities/__init__.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/utilities/dates.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/models/utilities/move.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/py.typed +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/responses/__init__.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/responses/search_response.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/responses/search_result.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/responses/xsl/search_match.xsl +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/search_parameters.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xml_helpers.py +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/break_judgment_checkout.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/checkin_judgment.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/checkout_judgment.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/copy_document.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/delete_judgment.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/document_collections.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/document_exists.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_combined_stats_table.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_components_for_document.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_highest_enrichment_version.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_highest_parser_version.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_judgment.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_judgment_checkout_status.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_judgment_version.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_last_modified.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_pending_parse_for_version.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_properties_for_search_results.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_property.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_version_annotation.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/get_version_created.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/insert_document.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/list_judgment_versions.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_boolean_property.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_metadata_citation.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_metadata_court.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_metadata_jurisdiction.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_metadata_name.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_metadata_this_uri.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_metadata_work_expression_date.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/set_property.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/update_document.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/update_locked_judgment.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/user_has_privilege.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/user_has_role.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/validate_all_documents.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/validate_document.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/xslt.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery/xslt_transform.xqy +0 -0
- {ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/src/caselawclient/xquery_type_dicts.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ds-caselaw-marklogic-api-client
|
|
3
|
-
Version: 24.0.1
|
|
3
|
+
Version: 26.0.0
|
|
4
4
|
Summary: An API client for interacting with the underlying data in Find Caselaw.
|
|
5
5
|
Home-page: https://github.com/nationalarchives/ds-caselaw-custom-api-client
|
|
6
6
|
Keywords: national archives,caselaw
|
|
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
13
|
Requires-Dist: boto3 (>=1.26.112,<2.0.0)
|
|
14
|
-
Requires-Dist: certifi (>=2024.
|
|
14
|
+
Requires-Dist: certifi (>=2024.8.30,<2024.9.0)
|
|
15
15
|
Requires-Dist: charset-normalizer (>=3.0.0,<4.0.0)
|
|
16
16
|
Requires-Dist: django-environ (>=0.11.0,<0.12.0)
|
|
17
17
|
Requires-Dist: ds-caselaw-utils (>=1.4.1,<2.0.0)
|
{ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/pyproject.toml
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "ds-caselaw-marklogic-api-client"
|
|
3
|
-
version = "24.0.1"
|
|
3
|
+
version = "26.0.0"
|
|
4
4
|
description = "An API client for interacting with the underlying data in Find Caselaw."
|
|
5
5
|
authors = ["The National Archives"]
|
|
6
6
|
homepage = "https://github.com/nationalarchives/ds-caselaw-custom-api-client"
|
|
@@ -12,7 +12,7 @@ packages = [
|
|
|
12
12
|
|
|
13
13
|
[tool.poetry.dependencies]
|
|
14
14
|
python = "^3.9"
|
|
15
|
-
certifi = ">=2024.
|
|
15
|
+
certifi = ">=2024.8.30,<2024.9.0"
|
|
16
16
|
charset-normalizer = "^3.0.0"
|
|
17
17
|
django-environ = "^0.11.0"
|
|
18
18
|
idna = "^3.4"
|
|
@@ -42,6 +42,13 @@ optional = true
|
|
|
42
42
|
[tool.poetry.group.docs.dependencies]
|
|
43
43
|
pdoc = "^14.0.0"
|
|
44
44
|
|
|
45
|
+
|
|
46
|
+
[tool.commitizen]
|
|
47
|
+
name = "cz_conventional_commits"
|
|
48
|
+
tag_format = "v$version"
|
|
49
|
+
version_scheme = "semver2"
|
|
50
|
+
version_provider = "poetry"
|
|
51
|
+
update_changelog_on_bump = true
|
|
45
52
|
[build-system]
|
|
46
53
|
requires = ["poetry-core"]
|
|
47
54
|
build-backend = "poetry.core.masonry.api"
|
|
@@ -56,12 +63,17 @@ filterwarnings = ["ignore::DeprecationWarning"]
|
|
|
56
63
|
line-length = 120
|
|
57
64
|
|
|
58
65
|
[tool.ruff.lint]
|
|
59
|
-
ignore = ["E501", "G004", "PLR2004", "RUF005", "RUF012", "UP040"] #
|
|
60
|
-
extend-select = ["W", "
|
|
61
|
-
|
|
62
|
-
|
|
66
|
+
ignore = ["E501", "G004", "PLR2004", "RUF005", "RUF012", "UP040"] # longlines, fstrings in logs, magic values, consider not concat, mutable classbits, type instead of TypeAlias
|
|
67
|
+
extend-select = ["W", "I", "SLF"]
|
|
68
|
+
# extend-select = [ "B", "Q", "C90", "I", "UP", "YTT", "ASYNC", "S", "BLE", "A", "COM", "C4", "DTZ", "T10", "DJ", "EM", "EXE", "FA",
|
|
69
|
+
# "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLOT", "SIM", "TID", "TCH", "INT", "PTH",
|
|
70
|
+
# "FIX", "PGH", "PL", "TRY", "FLY", "PERF", "RUF"]
|
|
63
71
|
unfixable = ["ERA"]
|
|
64
72
|
|
|
73
|
+
[tool.ruff.lint.extend-per-file-ignores]
|
|
74
|
+
"tests/*" = ["S101"] # `assert` is fine in tests
|
|
75
|
+
"tests/client/test_client.py" = ["SLF001"] # TODO: This really shouldn't be the case, but it's not important to fix right now.
|
|
76
|
+
|
|
65
77
|
# things skipped:
|
|
66
78
|
# N: naming, possibly good
|
|
67
79
|
# D: docstrings missing throughout
|
|
@@ -8,7 +8,7 @@ from datetime import datetime, time, timedelta
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Any, Optional, Type, Union
|
|
10
10
|
from xml.etree import ElementTree
|
|
11
|
-
from xml.etree.ElementTree import Element
|
|
11
|
+
from xml.etree.ElementTree import Element, ParseError, fromstring
|
|
12
12
|
|
|
13
13
|
import environ
|
|
14
14
|
import requests
|
|
@@ -34,7 +34,6 @@ from caselawclient.xquery_type_dicts import (
|
|
|
34
34
|
MarkLogicPrivilegeURIString,
|
|
35
35
|
)
|
|
36
36
|
|
|
37
|
-
from . import xml_tools
|
|
38
37
|
from .content_hash import validate_content_hash
|
|
39
38
|
from .errors import (
|
|
40
39
|
DocumentNotFoundError,
|
|
@@ -129,7 +128,7 @@ def get_single_string_from_marklogic_response(
|
|
|
129
128
|
# relies on "" being falsy.
|
|
130
129
|
return ""
|
|
131
130
|
|
|
132
|
-
|
|
131
|
+
if part_count > 1:
|
|
133
132
|
raise MultipartResponseLongerThanExpected(
|
|
134
133
|
f"Response returned {part_count} multipart items, expected 1",
|
|
135
134
|
)
|
|
@@ -148,7 +147,7 @@ def get_single_bytestring_from_marklogic_response(
|
|
|
148
147
|
# relies on "" being falsy.
|
|
149
148
|
return b""
|
|
150
149
|
|
|
151
|
-
|
|
150
|
+
if part_count > 1:
|
|
152
151
|
raise MultipartResponseLongerThanExpected(
|
|
153
152
|
f"Response returned {part_count} multipart items, expected 1",
|
|
154
153
|
)
|
|
@@ -231,12 +230,11 @@ class MarklogicApiClient:
|
|
|
231
230
|
|
|
232
231
|
if DOCUMENT_COLLECTION_URI_JUDGMENT in collections:
|
|
233
232
|
return Judgment
|
|
234
|
-
|
|
233
|
+
if DOCUMENT_COLLECTION_URI_PRESS_SUMMARY in collections:
|
|
235
234
|
return PressSummary
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
)
|
|
235
|
+
raise DocumentHasNoTypeCollection(
|
|
236
|
+
f"The document at URI {uri} is not part of a valid document type collection.",
|
|
237
|
+
)
|
|
240
238
|
|
|
241
239
|
def _get_error_code_class(self, error_code: str) -> Type[MarklogicAPIError]:
|
|
242
240
|
"""
|
|
@@ -251,6 +249,23 @@ class MarklogicApiClient:
|
|
|
251
249
|
def _path_to_request_url(self, path: str) -> str:
|
|
252
250
|
return f"{self.base_url}/{path.lstrip('/')}"
|
|
253
251
|
|
|
252
|
+
@classmethod
|
|
253
|
+
def _get_error_code(cls, content_as_xml: Optional[str]) -> str:
|
|
254
|
+
logging.warning(
|
|
255
|
+
"XMLTools is deprecated and will be removed in later versions. "
|
|
256
|
+
"Use methods from MarklogicApiClient.Client instead.",
|
|
257
|
+
)
|
|
258
|
+
if not content_as_xml:
|
|
259
|
+
return "Unknown error, Marklogic returned a null or empty response"
|
|
260
|
+
try:
|
|
261
|
+
xml = fromstring(content_as_xml)
|
|
262
|
+
return xml.find(
|
|
263
|
+
"message-code",
|
|
264
|
+
namespaces={"": "http://marklogic.com/xdmp/error"},
|
|
265
|
+
).text # type: ignore
|
|
266
|
+
except (ParseError, TypeError, AttributeError):
|
|
267
|
+
return "Unknown error, Marklogic returned a null or empty response"
|
|
268
|
+
|
|
254
269
|
def _raise_for_status(self, response: requests.Response) -> None:
|
|
255
270
|
try:
|
|
256
271
|
response.raise_for_status()
|
|
@@ -269,7 +284,8 @@ class MarklogicApiClient:
|
|
|
269
284
|
|
|
270
285
|
if new_error_class == self.default_http_error_class:
|
|
271
286
|
# Attempt to decode the error code from the response
|
|
272
|
-
|
|
287
|
+
|
|
288
|
+
error_code = self._get_error_code(response.content.decode("utf-8"))
|
|
273
289
|
|
|
274
290
|
new_error_class = self._get_error_code_class(error_code)
|
|
275
291
|
|
|
@@ -498,9 +514,8 @@ class MarklogicApiClient:
|
|
|
498
514
|
court, jurisdiction = re.split("\\s*/\\s*", content)
|
|
499
515
|
self.set_document_court(document_uri, court)
|
|
500
516
|
return self.set_document_jurisdiction(document_uri, jurisdiction)
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
return self.set_document_jurisdiction(document_uri, "")
|
|
517
|
+
self.set_document_court(document_uri, content)
|
|
518
|
+
return self.set_document_jurisdiction(document_uri, "")
|
|
504
519
|
|
|
505
520
|
def set_judgment_this_uri(
|
|
506
521
|
self,
|
|
@@ -1,27 +1,21 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import warnings
|
|
3
3
|
from functools import cached_property
|
|
4
|
-
from typing import TYPE_CHECKING, Any,
|
|
4
|
+
from typing import TYPE_CHECKING, Any, NewType, Optional
|
|
5
5
|
|
|
6
|
-
import pytz
|
|
7
6
|
from ds_caselaw_utils import courts
|
|
8
7
|
from ds_caselaw_utils.courts import CourtNotFoundException
|
|
9
|
-
from lxml import etree
|
|
10
8
|
from lxml import html as html_parser
|
|
11
9
|
from requests_toolbelt.multipart import decoder
|
|
12
10
|
|
|
13
|
-
from caselawclient.
|
|
14
|
-
from caselawclient.models.utilities.dates import parse_string_date_as_utc
|
|
15
|
-
|
|
16
|
-
from ..errors import (
|
|
11
|
+
from caselawclient.errors import (
|
|
17
12
|
DocumentNotFoundError,
|
|
18
13
|
GatewayTimeoutError,
|
|
19
14
|
NotSupportedOnVersion,
|
|
20
15
|
OnlySupportedOnVersion,
|
|
21
16
|
)
|
|
22
|
-
from
|
|
23
|
-
from .utilities import
|
|
24
|
-
from .utilities.aws import (
|
|
17
|
+
from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
|
|
18
|
+
from caselawclient.models.utilities.aws import (
|
|
25
19
|
ParserInstructionsDict,
|
|
26
20
|
announce_document_event,
|
|
27
21
|
check_docx_exists,
|
|
@@ -34,31 +28,17 @@ from .utilities.aws import (
|
|
|
34
28
|
uri_for_s3,
|
|
35
29
|
)
|
|
36
30
|
|
|
37
|
-
|
|
38
|
-
|
|
31
|
+
from .body import DocumentBody
|
|
32
|
+
from .exceptions import CannotPublishUnpublishableDocument, DocumentNotSafeForDeletion
|
|
33
|
+
from .statuses import DOCUMENT_STATUS_HOLD, DOCUMENT_STATUS_IN_PROGRESS, DOCUMENT_STATUS_NEW, DOCUMENT_STATUS_PUBLISHED
|
|
39
34
|
|
|
40
|
-
|
|
41
|
-
pass
|
|
35
|
+
MINIMUM_ENRICHMENT_TIME = datetime.timedelta(minutes=20)
|
|
42
36
|
|
|
43
37
|
|
|
44
38
|
class GatewayTimeoutGettingHTMLWithQuery(RuntimeWarning):
|
|
45
39
|
pass
|
|
46
40
|
|
|
47
41
|
|
|
48
|
-
DOCUMENT_STATUS_HOLD = "On hold"
|
|
49
|
-
""" This document has been placed on hold to actively prevent publication. """
|
|
50
|
-
|
|
51
|
-
DOCUMENT_STATUS_PUBLISHED = "Published"
|
|
52
|
-
""" This document has been published and should be considered publicly visible. """
|
|
53
|
-
|
|
54
|
-
DOCUMENT_STATUS_IN_PROGRESS = "In progress"
|
|
55
|
-
""" This document has not been published or put on hold, and has been picked up by an editor and
|
|
56
|
-
should be progressing through the document pipeline. """
|
|
57
|
-
|
|
58
|
-
DOCUMENT_STATUS_NEW = "New"
|
|
59
|
-
""" This document isn't published, on hold, or assigned, and can be picked up by an editor in the future. """
|
|
60
|
-
|
|
61
|
-
|
|
62
42
|
DOCUMENT_COLLECTION_URI_JUDGMENT = "judgment"
|
|
63
43
|
DOCUMENT_COLLECTION_URI_PRESS_SUMMARY = "press-summary"
|
|
64
44
|
|
|
@@ -67,19 +47,6 @@ if TYPE_CHECKING:
|
|
|
67
47
|
|
|
68
48
|
|
|
69
49
|
DocumentURIString = NewType("DocumentURIString", str)
|
|
70
|
-
CourtIdentifierString = NewType("CourtIdentifierString", str)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
class CannotPublishUnpublishableDocument(Exception):
|
|
74
|
-
"""A document which has failed publication safety checks in `Document.is_publishable` cannot be published."""
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class DocumentNotSafeForDeletion(Exception):
|
|
78
|
-
"""A document which is not safe for deletion cannot be deleted."""
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
class NonXMLDocumentError(Exception):
|
|
82
|
-
"""A document cannot be parsed as XML."""
|
|
83
50
|
|
|
84
51
|
|
|
85
52
|
class Document:
|
|
@@ -96,7 +63,7 @@ class Document:
|
|
|
96
63
|
|
|
97
64
|
attributes_to_validate: list[tuple[str, bool, str]] = [
|
|
98
65
|
(
|
|
99
|
-
"
|
|
66
|
+
"is_failure",
|
|
100
67
|
False,
|
|
101
68
|
"This document failed to parse",
|
|
102
69
|
),
|
|
@@ -143,20 +110,18 @@ class Document:
|
|
|
143
110
|
|
|
144
111
|
:raises DocumentNotFoundError: The document does not exist within MarkLogic
|
|
145
112
|
"""
|
|
146
|
-
self.uri = DocumentURIString(uri.strip("/"))
|
|
147
|
-
self.api_client = api_client
|
|
113
|
+
self.uri: DocumentURIString = DocumentURIString(uri.strip("/"))
|
|
114
|
+
self.api_client: MarklogicApiClient = api_client
|
|
148
115
|
if not self.document_exists():
|
|
149
116
|
raise DocumentNotFoundError(f"Document {self.uri} does not exist")
|
|
150
117
|
|
|
151
|
-
self.
|
|
152
|
-
xml_bytestring=self.api_client.get_judgment_xml_bytestring(
|
|
153
|
-
self.uri,
|
|
154
|
-
show_unpublished=True,
|
|
155
|
-
),
|
|
118
|
+
self.body: DocumentBody = DocumentBody(
|
|
119
|
+
xml_bytestring=self.api_client.get_judgment_xml_bytestring(self.uri, show_unpublished=True),
|
|
156
120
|
)
|
|
121
|
+
""" `Document.body` represents the XML of the document itself, without any information such as version tracking or properties. """
|
|
157
122
|
|
|
158
123
|
def __repr__(self) -> str:
|
|
159
|
-
name = self.name or "un-named"
|
|
124
|
+
name = self.body.name or "un-named"
|
|
160
125
|
return f"<{self.document_noun} {self.uri}: {name}>"
|
|
161
126
|
|
|
162
127
|
def document_exists(self) -> bool:
|
|
@@ -186,106 +151,6 @@ class Document:
|
|
|
186
151
|
"""
|
|
187
152
|
return f"https://caselaw.nationalarchives.gov.uk/{self.uri}"
|
|
188
153
|
|
|
189
|
-
@cached_property
|
|
190
|
-
def name(self) -> str:
|
|
191
|
-
return self.xml.get_xpath_match_string(
|
|
192
|
-
"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRname/@value",
|
|
193
|
-
{"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"},
|
|
194
|
-
)
|
|
195
|
-
|
|
196
|
-
@cached_property
|
|
197
|
-
def court(self) -> str:
|
|
198
|
-
return self.xml.get_xpath_match_string(
|
|
199
|
-
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:court/text()",
|
|
200
|
-
{
|
|
201
|
-
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
|
|
202
|
-
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
|
|
203
|
-
},
|
|
204
|
-
)
|
|
205
|
-
|
|
206
|
-
@cached_property
|
|
207
|
-
def jurisdiction(self) -> str:
|
|
208
|
-
return self.xml.get_xpath_match_string(
|
|
209
|
-
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:jurisdiction/text()",
|
|
210
|
-
{
|
|
211
|
-
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
|
|
212
|
-
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
|
|
213
|
-
},
|
|
214
|
-
)
|
|
215
|
-
|
|
216
|
-
@property
|
|
217
|
-
def court_and_jurisdiction_identifier_string(self) -> CourtIdentifierString:
|
|
218
|
-
if self.jurisdiction != "":
|
|
219
|
-
return CourtIdentifierString("/".join((self.court, self.jurisdiction)))
|
|
220
|
-
else:
|
|
221
|
-
return CourtIdentifierString(self.court)
|
|
222
|
-
|
|
223
|
-
@cached_property
|
|
224
|
-
def document_date_as_string(self) -> str:
|
|
225
|
-
return self.xml.get_xpath_match_string(
|
|
226
|
-
"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRdate/@date",
|
|
227
|
-
{"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"},
|
|
228
|
-
)
|
|
229
|
-
|
|
230
|
-
@cached_property
|
|
231
|
-
def document_date_as_date(self) -> Optional[datetime.date]:
|
|
232
|
-
if not self.document_date_as_string:
|
|
233
|
-
return None
|
|
234
|
-
try:
|
|
235
|
-
return datetime.datetime.strptime(
|
|
236
|
-
self.document_date_as_string,
|
|
237
|
-
"%Y-%m-%d",
|
|
238
|
-
).date()
|
|
239
|
-
except ValueError:
|
|
240
|
-
warnings.warn(
|
|
241
|
-
f"Unparsable date encountered: {self.document_date_as_string}",
|
|
242
|
-
UnparsableDate,
|
|
243
|
-
)
|
|
244
|
-
return None
|
|
245
|
-
|
|
246
|
-
def get_manifestation_datetimes(
|
|
247
|
-
self,
|
|
248
|
-
name: Optional[str] = None,
|
|
249
|
-
) -> list[datetime.datetime]:
|
|
250
|
-
name_filter = f"[@name='{name}']" if name else ""
|
|
251
|
-
iso_datetimes = self.xml.get_xpath_match_strings(
|
|
252
|
-
"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRManifestation"
|
|
253
|
-
f"/akn:FRBRdate{name_filter}/@date",
|
|
254
|
-
{"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"},
|
|
255
|
-
)
|
|
256
|
-
|
|
257
|
-
return [parse_string_date_as_utc(event, pytz.UTC) for event in iso_datetimes]
|
|
258
|
-
|
|
259
|
-
def get_latest_manifestation_datetime(
|
|
260
|
-
self,
|
|
261
|
-
name: Optional[str] = None,
|
|
262
|
-
) -> Optional[datetime.datetime]:
|
|
263
|
-
events = self.get_manifestation_datetimes(name)
|
|
264
|
-
if not events:
|
|
265
|
-
return None
|
|
266
|
-
else:
|
|
267
|
-
return max(events)
|
|
268
|
-
|
|
269
|
-
def get_latest_manifestation_type(self) -> Optional[str]:
|
|
270
|
-
return max(
|
|
271
|
-
(
|
|
272
|
-
(type, time)
|
|
273
|
-
for type in ["transform", "tna-enriched"]
|
|
274
|
-
if (time := self.get_latest_manifestation_datetime(type))
|
|
275
|
-
),
|
|
276
|
-
key=lambda x: x[1],
|
|
277
|
-
)[0]
|
|
278
|
-
|
|
279
|
-
@cached_property
|
|
280
|
-
def transformation_datetime(self) -> Optional[datetime.datetime]:
|
|
281
|
-
"""When was this document successfully parsed or reparsed (date from XML)"""
|
|
282
|
-
return self.get_latest_manifestation_datetime("transform")
|
|
283
|
-
|
|
284
|
-
@cached_property
|
|
285
|
-
def enrichment_datetime(self) -> Optional[datetime.datetime]:
|
|
286
|
-
"""When was this document successfully enriched (date from XML)"""
|
|
287
|
-
return self.get_latest_manifestation_datetime("tna-enriched")
|
|
288
|
-
|
|
289
154
|
@cached_property
|
|
290
155
|
def is_published(self) -> bool:
|
|
291
156
|
return self.api_client.get_published(self.uri)
|
|
@@ -374,10 +239,6 @@ class Document:
|
|
|
374
239
|
"Is this document a potentially historic version of a document, or is it the main document itself?"
|
|
375
240
|
return extract_version(self.uri) != 0
|
|
376
241
|
|
|
377
|
-
@cached_property
|
|
378
|
-
def content_as_xml(self) -> str:
|
|
379
|
-
return self.xml.xml_as_string
|
|
380
|
-
|
|
381
242
|
def content_as_html(
|
|
382
243
|
self,
|
|
383
244
|
version_uri: Optional[DocumentURIString] = None,
|
|
@@ -404,8 +265,7 @@ class Document:
|
|
|
404
265
|
GatewayTimeoutGettingHTMLWithQuery,
|
|
405
266
|
)
|
|
406
267
|
return self.content_as_html(version_uri)
|
|
407
|
-
|
|
408
|
-
raise e
|
|
268
|
+
raise e
|
|
409
269
|
|
|
410
270
|
def number_of_mentions(self, query: str) -> int:
|
|
411
271
|
html = self.content_as_html(query=query)
|
|
@@ -421,7 +281,7 @@ class Document:
|
|
|
421
281
|
|
|
422
282
|
:return: `True` if this document is in a 'failure' state, otherwise `False`
|
|
423
283
|
"""
|
|
424
|
-
if self.failed_to_parse:
|
|
284
|
+
if self.body.failed_to_parse:
|
|
425
285
|
return True
|
|
426
286
|
return False
|
|
427
287
|
|
|
@@ -431,20 +291,9 @@ class Document:
|
|
|
431
291
|
return True
|
|
432
292
|
return False
|
|
433
293
|
|
|
434
|
-
@cached_property
|
|
435
|
-
def failed_to_parse(self) -> bool:
|
|
436
|
-
"""
|
|
437
|
-
Did this document entirely fail to parse?
|
|
438
|
-
|
|
439
|
-
:return: `True` if there was a complete parser failure, otherwise `False`
|
|
440
|
-
"""
|
|
441
|
-
if "error" in self.xml.root_element:
|
|
442
|
-
return True
|
|
443
|
-
return False
|
|
444
|
-
|
|
445
294
|
@cached_property
|
|
446
295
|
def has_name(self) -> bool:
|
|
447
|
-
if not self.name:
|
|
296
|
+
if not self.body.name:
|
|
448
297
|
return False
|
|
449
298
|
|
|
450
299
|
return True
|
|
@@ -453,7 +302,7 @@ class Document:
|
|
|
453
302
|
def has_valid_court(self) -> bool:
|
|
454
303
|
try:
|
|
455
304
|
return bool(
|
|
456
|
-
courts.get_by_code(self.court_and_jurisdiction_identifier_string),
|
|
305
|
+
courts.get_by_code(self.body.court_and_jurisdiction_identifier_string),
|
|
457
306
|
)
|
|
458
307
|
except CourtNotFoundException:
|
|
459
308
|
return False
|
|
@@ -534,7 +383,7 @@ class Document:
|
|
|
534
383
|
Has this document been enriched recently?
|
|
535
384
|
"""
|
|
536
385
|
|
|
537
|
-
last_enrichment = self.enrichment_datetime
|
|
386
|
+
last_enrichment = self.body.enrichment_datetime
|
|
538
387
|
if not last_enrichment:
|
|
539
388
|
return False
|
|
540
389
|
|
|
@@ -615,7 +464,11 @@ class Document:
|
|
|
615
464
|
self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
|
|
616
465
|
|
|
617
466
|
parser_type_noun = {"judgment": "judgment", "press summary": "pressSummary"}[self.document_noun]
|
|
618
|
-
checked_date =
|
|
467
|
+
checked_date: Optional[str] = (
|
|
468
|
+
self.body.document_date_as_date.isoformat()
|
|
469
|
+
if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1)
|
|
470
|
+
else None
|
|
471
|
+
)
|
|
619
472
|
|
|
620
473
|
# the keys of parser_instructions should exactly match the parser output
|
|
621
474
|
# in the *-metadata.json files by the parser. Whilst typically empty
|
|
@@ -624,9 +477,9 @@ class Document:
|
|
|
624
477
|
parser_instructions: ParserInstructionsDict = {
|
|
625
478
|
"documentType": parser_type_noun,
|
|
626
479
|
"metadata": {
|
|
627
|
-
"name": self.name or None,
|
|
480
|
+
"name": self.body.name or None,
|
|
628
481
|
"cite": self.best_human_identifier or None,
|
|
629
|
-
"court": self.court or None,
|
|
482
|
+
"court": self.body.court or None,
|
|
630
483
|
"date": checked_date,
|
|
631
484
|
"uri": self.uri,
|
|
632
485
|
},
|
|
@@ -656,38 +509,3 @@ class Document:
|
|
|
656
509
|
if self.docx_exists():
|
|
657
510
|
return True
|
|
658
511
|
return False
|
|
659
|
-
|
|
660
|
-
class XML:
|
|
661
|
-
"""
|
|
662
|
-
Represents the XML of a document, and should contain all methods for interacting with it.
|
|
663
|
-
"""
|
|
664
|
-
|
|
665
|
-
def __init__(self, xml_bytestring: bytes):
|
|
666
|
-
"""
|
|
667
|
-
:raises NonXMLDocumentError: This document is not valid XML
|
|
668
|
-
"""
|
|
669
|
-
try:
|
|
670
|
-
self.xml_as_tree: etree.Element = etree.fromstring(xml_bytestring)
|
|
671
|
-
except etree.XMLSyntaxError:
|
|
672
|
-
raise NonXMLDocumentError
|
|
673
|
-
|
|
674
|
-
@property
|
|
675
|
-
def xml_as_string(self) -> str:
|
|
676
|
-
"""
|
|
677
|
-
:return: A string representation of this document's XML tree.
|
|
678
|
-
"""
|
|
679
|
-
return str(etree.tostring(self.xml_as_tree).decode(encoding="utf-8"))
|
|
680
|
-
|
|
681
|
-
@property
|
|
682
|
-
def root_element(self) -> str:
|
|
683
|
-
return str(self.xml_as_tree.tag)
|
|
684
|
-
|
|
685
|
-
def get_xpath_match_string(self, xpath: str, namespaces: Dict[str, str]) -> str:
|
|
686
|
-
return get_xpath_match_string(self.xml_as_tree, xpath, namespaces)
|
|
687
|
-
|
|
688
|
-
def get_xpath_match_strings(
|
|
689
|
-
self,
|
|
690
|
-
xpath: str,
|
|
691
|
-
namespaces: Dict[str, str],
|
|
692
|
-
) -> list[str]:
|
|
693
|
-
return get_xpath_match_strings(self.xml_as_tree, xpath, namespaces)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import warnings
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
from typing import NewType, Optional
|
|
5
|
+
|
|
6
|
+
import pytz
|
|
7
|
+
|
|
8
|
+
from caselawclient.models.utilities.dates import parse_string_date_as_utc
|
|
9
|
+
|
|
10
|
+
from .xml import XML
|
|
11
|
+
|
|
12
|
+
CourtIdentifierString = NewType("CourtIdentifierString", str)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UnparsableDate(Warning):
|
|
16
|
+
pass
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DocumentBody:
|
|
20
|
+
"""
|
|
21
|
+
A class for abstracting out interactions with the body of a document.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def __init__(self, xml_bytestring: bytes):
|
|
25
|
+
self._xml = XML(xml_bytestring=xml_bytestring)
|
|
26
|
+
""" This is an instance of the `Document.XML` class for manipulation of the XML document itself. """
|
|
27
|
+
|
|
28
|
+
def get_xpath_match_string(self, xpath: str, namespaces: dict[str, str]) -> str:
|
|
29
|
+
return self._xml.get_xpath_match_string(xpath, namespaces)
|
|
30
|
+
|
|
31
|
+
@cached_property
|
|
32
|
+
def name(self) -> str:
|
|
33
|
+
return self._xml.get_xpath_match_string(
|
|
34
|
+
"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRname/@value",
|
|
35
|
+
{"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"},
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
@cached_property
|
|
39
|
+
def court(self) -> str:
|
|
40
|
+
return self._xml.get_xpath_match_string(
|
|
41
|
+
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:court/text()",
|
|
42
|
+
{
|
|
43
|
+
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
|
|
44
|
+
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
|
|
45
|
+
},
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
@cached_property
|
|
49
|
+
def jurisdiction(self) -> str:
|
|
50
|
+
return self._xml.get_xpath_match_string(
|
|
51
|
+
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:jurisdiction/text()",
|
|
52
|
+
{
|
|
53
|
+
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
|
|
54
|
+
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
|
|
55
|
+
},
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
@property
|
|
59
|
+
def court_and_jurisdiction_identifier_string(self) -> CourtIdentifierString:
|
|
60
|
+
if self.jurisdiction != "":
|
|
61
|
+
return CourtIdentifierString("/".join((self.court, self.jurisdiction)))
|
|
62
|
+
return CourtIdentifierString(self.court)
|
|
63
|
+
|
|
64
|
+
@cached_property
|
|
65
|
+
def document_date_as_string(self) -> str:
|
|
66
|
+
return self._xml.get_xpath_match_string(
|
|
67
|
+
"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRdate/@date",
|
|
68
|
+
{"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"},
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
@cached_property
|
|
72
|
+
def document_date_as_date(self) -> Optional[datetime.date]:
|
|
73
|
+
if not self.document_date_as_string:
|
|
74
|
+
return None
|
|
75
|
+
try:
|
|
76
|
+
return datetime.datetime.strptime(
|
|
77
|
+
self.document_date_as_string,
|
|
78
|
+
"%Y-%m-%d",
|
|
79
|
+
).date()
|
|
80
|
+
except ValueError:
|
|
81
|
+
warnings.warn(
|
|
82
|
+
f"Unparsable date encountered: {self.document_date_as_string}",
|
|
83
|
+
UnparsableDate,
|
|
84
|
+
)
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
def get_manifestation_datetimes(
|
|
88
|
+
self,
|
|
89
|
+
name: Optional[str] = None,
|
|
90
|
+
) -> list[datetime.datetime]:
|
|
91
|
+
name_filter = f"[@name='{name}']" if name else ""
|
|
92
|
+
iso_datetimes = self._xml.get_xpath_match_strings(
|
|
93
|
+
"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRManifestation"
|
|
94
|
+
f"/akn:FRBRdate{name_filter}/@date",
|
|
95
|
+
{"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"},
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
return [parse_string_date_as_utc(event, pytz.UTC) for event in iso_datetimes]
|
|
99
|
+
|
|
100
|
+
def get_latest_manifestation_datetime(
|
|
101
|
+
self,
|
|
102
|
+
name: Optional[str] = None,
|
|
103
|
+
) -> Optional[datetime.datetime]:
|
|
104
|
+
events = self.get_manifestation_datetimes(name)
|
|
105
|
+
if not events:
|
|
106
|
+
return None
|
|
107
|
+
return max(events)
|
|
108
|
+
|
|
109
|
+
def get_latest_manifestation_type(self) -> Optional[str]:
|
|
110
|
+
return max(
|
|
111
|
+
(
|
|
112
|
+
(type, time)
|
|
113
|
+
for type in ["transform", "tna-enriched"]
|
|
114
|
+
if (time := self.get_latest_manifestation_datetime(type))
|
|
115
|
+
),
|
|
116
|
+
key=lambda x: x[1],
|
|
117
|
+
)[0]
|
|
118
|
+
|
|
119
|
+
@cached_property
|
|
120
|
+
def transformation_datetime(self) -> Optional[datetime.datetime]:
|
|
121
|
+
"""When was this document successfully parsed or reparsed (date from XML)"""
|
|
122
|
+
return self.get_latest_manifestation_datetime("transform")
|
|
123
|
+
|
|
124
|
+
@cached_property
|
|
125
|
+
def enrichment_datetime(self) -> Optional[datetime.datetime]:
|
|
126
|
+
"""When was this document successfully enriched (date from XML)"""
|
|
127
|
+
return self.get_latest_manifestation_datetime("tna-enriched")
|
|
128
|
+
|
|
129
|
+
@cached_property
|
|
130
|
+
def content_as_xml(self) -> str:
|
|
131
|
+
return self._xml.xml_as_string
|
|
132
|
+
|
|
133
|
+
@cached_property
|
|
134
|
+
def failed_to_parse(self) -> bool:
|
|
135
|
+
"""
|
|
136
|
+
Did this document entirely fail to parse?
|
|
137
|
+
|
|
138
|
+
:return: `True` if there was a complete parser failure, otherwise `False`
|
|
139
|
+
"""
|
|
140
|
+
if "error" in self._xml.root_element:
|
|
141
|
+
return True
|
|
142
|
+
return False
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
class CannotPublishUnpublishableDocument(Exception):
|
|
2
|
+
"""A document which has failed publication safety checks in `Document.is_publishable` cannot be published."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DocumentNotSafeForDeletion(Exception):
|
|
6
|
+
"""A document which is not safe for deletion cannot be deleted."""
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
DOCUMENT_STATUS_HOLD = "On hold"
|
|
2
|
+
""" This document has been placed on hold to actively prevent publication. """
|
|
3
|
+
|
|
4
|
+
DOCUMENT_STATUS_PUBLISHED = "Published"
|
|
5
|
+
""" This document has been published and should be considered publicly visible. """
|
|
6
|
+
|
|
7
|
+
DOCUMENT_STATUS_IN_PROGRESS = "In progress"
|
|
8
|
+
""" This document has not been published or put on hold, and has been picked up by an editor and
|
|
9
|
+
should be progressing through the document pipeline. """
|
|
10
|
+
|
|
11
|
+
DOCUMENT_STATUS_NEW = "New"
|
|
12
|
+
""" This document isn't published, on hold, or assigned, and can be picked up by an editor in the future. """
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from lxml import etree
|
|
2
|
+
|
|
3
|
+
from caselawclient.xml_helpers import get_xpath_match_string, get_xpath_match_strings
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NonXMLDocumentError(Exception):
|
|
7
|
+
"""A document cannot be parsed as XML."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class XML:
|
|
11
|
+
"""
|
|
12
|
+
A class for interacting with the raw XML of a document.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(self, xml_bytestring: bytes):
|
|
16
|
+
"""
|
|
17
|
+
:raises NonXMLDocumentError: This document is not valid XML
|
|
18
|
+
"""
|
|
19
|
+
try:
|
|
20
|
+
self.xml_as_tree: etree.Element = etree.fromstring(xml_bytestring)
|
|
21
|
+
except etree.XMLSyntaxError:
|
|
22
|
+
raise NonXMLDocumentError
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def xml_as_string(self) -> str:
|
|
26
|
+
"""
|
|
27
|
+
:return: A string representation of this document's XML tree.
|
|
28
|
+
"""
|
|
29
|
+
return str(etree.tostring(self.xml_as_tree).decode(encoding="utf-8"))
|
|
30
|
+
|
|
31
|
+
@property
|
|
32
|
+
def root_element(self) -> str:
|
|
33
|
+
return str(self.xml_as_tree.tag)
|
|
34
|
+
|
|
35
|
+
def get_xpath_match_string(self, xpath: str, namespaces: dict[str, str]) -> str:
|
|
36
|
+
return get_xpath_match_string(self.xml_as_tree, xpath, namespaces)
|
|
37
|
+
|
|
38
|
+
def get_xpath_match_strings(
|
|
39
|
+
self,
|
|
40
|
+
xpath: str,
|
|
41
|
+
namespaces: dict[str, str],
|
|
42
|
+
) -> list[str]:
|
|
43
|
+
return get_xpath_match_strings(self.xml_as_tree, xpath, namespaces)
|
|
@@ -8,7 +8,6 @@ from caselawclient.models.neutral_citation_mixin import NeutralCitationMixin
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
9
|
from caselawclient.models.press_summaries import PressSummary
|
|
10
10
|
|
|
11
|
-
from ..xml_helpers import get_xpath_match_string
|
|
12
11
|
from .documents import Document
|
|
13
12
|
|
|
14
13
|
|
|
@@ -25,8 +24,7 @@ class Judgment(NeutralCitationMixin, Document):
|
|
|
25
24
|
|
|
26
25
|
@cached_property
|
|
27
26
|
def neutral_citation(self) -> str:
|
|
28
|
-
return get_xpath_match_string(
|
|
29
|
-
self.xml.xml_as_tree,
|
|
27
|
+
return self.body.get_xpath_match_string(
|
|
30
28
|
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
|
|
31
29
|
{
|
|
32
30
|
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
|
|
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Optional
|
|
|
6
6
|
|
|
7
7
|
from caselawclient.errors import DocumentNotFoundError
|
|
8
8
|
from caselawclient.models.neutral_citation_mixin import NeutralCitationMixin
|
|
9
|
-
from caselawclient.xml_helpers import get_xpath_match_string
|
|
10
9
|
|
|
11
10
|
from .documents import Document
|
|
12
11
|
|
|
@@ -27,8 +26,7 @@ class PressSummary(NeutralCitationMixin, Document):
|
|
|
27
26
|
|
|
28
27
|
@cached_property
|
|
29
28
|
def neutral_citation(self) -> str:
|
|
30
|
-
return get_xpath_match_string(
|
|
31
|
-
self.xml.xml_as_tree,
|
|
29
|
+
return self.body.get_xpath_match_string(
|
|
32
30
|
"/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
|
|
33
31
|
{
|
|
34
32
|
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
|
|
@@ -218,8 +218,7 @@ def build_new_key(old_key: str, new_uri: str) -> str:
|
|
|
218
218
|
if old_filename.endswith(".docx") or old_filename.endswith(".pdf"):
|
|
219
219
|
new_filename = new_uri.replace("/", "_")
|
|
220
220
|
return f"{new_uri}/{new_filename}.{old_filename.split('.')[-1]}"
|
|
221
|
-
|
|
222
|
-
return f"{new_uri}/{old_filename}"
|
|
221
|
+
return f"{new_uri}/{old_filename}"
|
|
223
222
|
|
|
224
223
|
|
|
225
224
|
def request_parse(
|
|
@@ -250,6 +249,8 @@ def request_parse(
|
|
|
250
249
|
},
|
|
251
250
|
}
|
|
252
251
|
|
|
252
|
+
# breakpoint()
|
|
253
|
+
|
|
253
254
|
client.publish(
|
|
254
255
|
TopicArn=env("REPARSE_SNS_TOPIC"),
|
|
255
256
|
Message=json.dumps(message_to_send),
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from typing import List, Optional
|
|
3
|
-
from xml.etree.ElementTree import (
|
|
4
|
-
Element,
|
|
5
|
-
ElementTree,
|
|
6
|
-
ParseError,
|
|
7
|
-
QName,
|
|
8
|
-
fromstring,
|
|
9
|
-
tostring,
|
|
10
|
-
)
|
|
11
|
-
|
|
12
|
-
akn_uk_namespaces = {
|
|
13
|
-
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
|
|
14
|
-
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
|
|
15
|
-
}
|
|
16
|
-
akn_namespace_uri = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"
|
|
17
|
-
uk_namespace_uri = "https://caselaw.nationalarchives.gov.uk/akn"
|
|
18
|
-
search_namespace = {"search": "http://marklogic.com/appservices/search"}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class JudgmentMissingMetadataError(IndexError):
|
|
22
|
-
pass
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def get_element(
|
|
26
|
-
xml: ElementTree,
|
|
27
|
-
xpath: str,
|
|
28
|
-
element_name: str = "FRBRname",
|
|
29
|
-
element_namespace: str = akn_namespace_uri,
|
|
30
|
-
has_value_attribute: bool = True,
|
|
31
|
-
) -> Element:
|
|
32
|
-
logging.warning(
|
|
33
|
-
"XMLTools is deprecated and will be removed in later versions. "
|
|
34
|
-
"Use methods from MarklogicApiClient.Client instead.",
|
|
35
|
-
)
|
|
36
|
-
name = xml.find(
|
|
37
|
-
xpath,
|
|
38
|
-
namespaces=akn_uk_namespaces,
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
if name is None:
|
|
42
|
-
element = Element(QName(element_namespace, element_name)) # type: ignore
|
|
43
|
-
if has_value_attribute:
|
|
44
|
-
element.set("value", "")
|
|
45
|
-
return element
|
|
46
|
-
|
|
47
|
-
return name
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def get_neutral_citation_element(xml: ElementTree) -> Element:
|
|
51
|
-
return get_element(xml, ".//uk:cite", "cite", uk_namespace_uri, False)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def get_neutral_citation_name_value(xml: ElementTree) -> Optional[str]:
|
|
55
|
-
return get_neutral_citation_element(xml).text
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def get_judgment_date_element(xml: ElementTree) -> Element:
|
|
59
|
-
logging.warning(
|
|
60
|
-
"XMLTools is deprecated and will be removed in later versions. "
|
|
61
|
-
"Use methods from MarklogicApiClient.Client instead.",
|
|
62
|
-
)
|
|
63
|
-
name = xml.find(
|
|
64
|
-
".//akn:FRBRWork/akn:FRBRdate",
|
|
65
|
-
namespaces=akn_uk_namespaces,
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
if name is None:
|
|
69
|
-
element = Element(QName(akn_namespace_uri, "FRBRdate")) # type: ignore
|
|
70
|
-
element.set("date", "")
|
|
71
|
-
element.set("name", "judgment")
|
|
72
|
-
|
|
73
|
-
return element
|
|
74
|
-
|
|
75
|
-
return name
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def get_judgment_date_value(xml: ElementTree) -> str:
|
|
79
|
-
return get_judgment_date_element(xml).attrib["date"]
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def get_court_element(xml: ElementTree) -> Element:
|
|
83
|
-
return get_element(xml, ".//uk:court", "court", uk_namespace_uri, False)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def get_court_value(xml: ElementTree) -> Optional[str]:
|
|
87
|
-
return get_court_element(xml).text
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def get_metadata_name_element(xml: ElementTree) -> Element:
|
|
91
|
-
return get_element(xml, ".//akn:FRBRname", "FRBRname", akn_namespace_uri, True)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def get_metadata_name_value(xml: ElementTree) -> str:
|
|
95
|
-
name = get_metadata_name_element(xml)
|
|
96
|
-
value = name.attrib["value"]
|
|
97
|
-
if value is None:
|
|
98
|
-
return ""
|
|
99
|
-
return value
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def get_search_matches(element: ElementTree) -> List[str]:
|
|
103
|
-
logging.warning(
|
|
104
|
-
"XMLTools is deprecated and will be removed in later versions. "
|
|
105
|
-
"Use methods from MarklogicApiClient.Client instead.",
|
|
106
|
-
)
|
|
107
|
-
nodes = element.findall(".//search:match", namespaces=search_namespace)
|
|
108
|
-
results = []
|
|
109
|
-
for node in nodes:
|
|
110
|
-
text = tostring(node, method="text", encoding="UTF-8")
|
|
111
|
-
results.append(text.decode("UTF-8").strip())
|
|
112
|
-
return results
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def get_error_code(content_as_xml: Optional[str]) -> str:
|
|
116
|
-
logging.warning(
|
|
117
|
-
"XMLTools is deprecated and will be removed in later versions. "
|
|
118
|
-
"Use methods from MarklogicApiClient.Client instead.",
|
|
119
|
-
)
|
|
120
|
-
if not content_as_xml:
|
|
121
|
-
return "Unknown error, Marklogic returned a null or empty response"
|
|
122
|
-
try:
|
|
123
|
-
xml = fromstring(content_as_xml)
|
|
124
|
-
return xml.find(
|
|
125
|
-
"message-code",
|
|
126
|
-
namespaces={"": "http://marklogic.com/xdmp/error"},
|
|
127
|
-
).text # type: ignore
|
|
128
|
-
except (ParseError, TypeError, AttributeError):
|
|
129
|
-
return "Unknown error, Marklogic returned a null or empty response"
|
{ds_caselaw_marklogic_api_client-24.0.1 → ds_caselaw_marklogic_api_client-26.0.0}/LICENSE.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|