python-documentcloud 4.4.1__tar.gz → 4.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/PKG-INFO +1 -1
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/addon.py +20 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/client.py +4 -8
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/documents.py +112 -145
- python_documentcloud-4.6.0/documentcloud/exceptions.py +15 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/PKG-INFO +1 -1
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/SOURCES.txt +1 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/setup.py +1 -1
- python_documentcloud-4.6.0/tests/test_addon.py +141 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_documents.py +1 -4
- python_documentcloud-4.4.1/documentcloud/exceptions.py +0 -12
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/LICENSE +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/README.md +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/__init__.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/annotations.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/base.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/constants.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/organizations.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/projects.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/sections.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/toolbox.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/users.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/dependency_links.txt +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/requires.txt +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/top_level.txt +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/setup.cfg +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_annotations.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_base.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_client.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_organizations.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_projects.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_sections.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_toolbox.py +0 -0
- {python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_users.py +0 -0
|
@@ -182,6 +182,26 @@ class AddOn(BaseAddOn):
|
|
|
182
182
|
f"addon_runs/{self.id}/", json={"file_name": file_name}
|
|
183
183
|
)
|
|
184
184
|
|
|
185
|
+
def load_run_data(self):
|
|
186
|
+
"Load persistent data from this run"
|
|
187
|
+
if not self.id:
|
|
188
|
+
return {}
|
|
189
|
+
|
|
190
|
+
response = self.client.get(f"addon_runs/{self.id}/")
|
|
191
|
+
response.raise_for_status()
|
|
192
|
+
return response.json().get("data", {})
|
|
193
|
+
|
|
194
|
+
def store_run_data(self, data):
|
|
195
|
+
"Store persistent data for this run"
|
|
196
|
+
if not self.id:
|
|
197
|
+
print("Run ID not set. Try again later or check if something went wrong.")
|
|
198
|
+
return None
|
|
199
|
+
|
|
200
|
+
if not isinstance(data, dict):
|
|
201
|
+
raise TypeError("Invalid data")
|
|
202
|
+
|
|
203
|
+
return self.client.patch(f"addon_runs/{self.id}/", json={"data": data})
|
|
204
|
+
|
|
185
205
|
def load_event_data(self):
|
|
186
206
|
"""Load persistent data for this event"""
|
|
187
207
|
if not self.event_id:
|
|
@@ -14,11 +14,12 @@ from .users import UserClient
|
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger("documentcloud")
|
|
16
16
|
|
|
17
|
+
|
|
17
18
|
class DocumentCloud(SquareletClient):
|
|
18
19
|
"""
|
|
19
20
|
The public interface for the DocumentCloud API, now integrated with SquareletClient
|
|
20
21
|
"""
|
|
21
|
-
|
|
22
|
+
|
|
22
23
|
def __init__(
|
|
23
24
|
self,
|
|
24
25
|
username=None,
|
|
@@ -30,7 +31,7 @@ class DocumentCloud(SquareletClient):
|
|
|
30
31
|
rate_limit=True,
|
|
31
32
|
rate_limit_sleep=True,
|
|
32
33
|
):
|
|
33
|
-
|
|
34
|
+
# Initialize SquareletClient for authentication and request handling
|
|
34
35
|
super().__init__(
|
|
35
36
|
base_uri=base_uri,
|
|
36
37
|
username=username,
|
|
@@ -38,7 +39,7 @@ class DocumentCloud(SquareletClient):
|
|
|
38
39
|
auth_uri=auth_uri,
|
|
39
40
|
timeout=timeout,
|
|
40
41
|
rate_limit=rate_limit,
|
|
41
|
-
rate_limit_sleep=rate_limit_sleep
|
|
42
|
+
rate_limit_sleep=rate_limit_sleep,
|
|
42
43
|
)
|
|
43
44
|
|
|
44
45
|
# Set up logging
|
|
@@ -55,8 +56,3 @@ class DocumentCloud(SquareletClient):
|
|
|
55
56
|
self.projects = ProjectClient(self)
|
|
56
57
|
self.users = UserClient(self)
|
|
57
58
|
self.organizations = OrganizationClient(self)
|
|
58
|
-
|
|
59
|
-
"""def _request(self, method, url, raise_error=True, **kwargs):
|
|
60
|
-
Delegates request to the SquareletClient's _request method
|
|
61
|
-
return self.squarelet_client.request(method, url, raise_error, **kwargs)
|
|
62
|
-
"""
|
|
@@ -9,6 +9,7 @@ import os
|
|
|
9
9
|
import re
|
|
10
10
|
import warnings
|
|
11
11
|
from functools import partial
|
|
12
|
+
from urllib.parse import urlparse
|
|
12
13
|
|
|
13
14
|
# Third Party
|
|
14
15
|
from requests.exceptions import RequestException
|
|
@@ -23,11 +24,6 @@ from .sections import SectionClient
|
|
|
23
24
|
from .toolbox import grouper, is_url, merge_dicts, requests_retry_session
|
|
24
25
|
from .users import User
|
|
25
26
|
|
|
26
|
-
try:
|
|
27
|
-
from urllib.parse import urlparse
|
|
28
|
-
except ImportError:
|
|
29
|
-
from urlparse import urlparse
|
|
30
|
-
|
|
31
27
|
logger = logging.getLogger("documentcloud")
|
|
32
28
|
|
|
33
29
|
IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
|
|
@@ -74,8 +70,11 @@ class Document(BaseAPIObject):
|
|
|
74
70
|
def __getattr__(self, attr):
|
|
75
71
|
"""Generate methods for fetching resources"""
|
|
76
72
|
p_image = re.compile(
|
|
77
|
-
r"^get_
|
|
73
|
+
r"^get_"
|
|
74
|
+
r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
|
|
75
|
+
r"(?P<list>_list)?$"
|
|
78
76
|
)
|
|
77
|
+
|
|
79
78
|
get = attr.startswith("get_")
|
|
80
79
|
url = attr.endswith("_url")
|
|
81
80
|
text = attr.endswith("_text")
|
|
@@ -230,9 +229,15 @@ class Document(BaseAPIObject):
|
|
|
230
229
|
|
|
231
230
|
return all_results
|
|
232
231
|
|
|
233
|
-
def process(self):
|
|
234
|
-
"""
|
|
235
|
-
|
|
232
|
+
def process(self, **kwargs):
|
|
233
|
+
"""Process the document, used on upload and for reprocessing"""
|
|
234
|
+
payload = {}
|
|
235
|
+
if "force_ocr" in kwargs:
|
|
236
|
+
payload["force_ocr"] = kwargs["force_ocr"]
|
|
237
|
+
if "ocr_engine" in kwargs:
|
|
238
|
+
payload["ocr_engine"] = kwargs["ocr_engine"]
|
|
239
|
+
|
|
240
|
+
self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
|
|
236
241
|
|
|
237
242
|
|
|
238
243
|
class DocumentClient(BaseAPIClient):
|
|
@@ -310,6 +315,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
310
315
|
"title",
|
|
311
316
|
"data",
|
|
312
317
|
"force_ocr",
|
|
318
|
+
"ocr_engine",
|
|
313
319
|
"projects",
|
|
314
320
|
"delayed_index",
|
|
315
321
|
"revision_control",
|
|
@@ -333,21 +339,55 @@ class DocumentClient(BaseAPIClient):
|
|
|
333
339
|
|
|
334
340
|
return params
|
|
335
341
|
|
|
342
|
+
def _extract_ocr_options(self, kwargs):
|
|
343
|
+
"""
|
|
344
|
+
Extract and validate OCR options from kwargs.
|
|
345
|
+
|
|
346
|
+
Returns:
|
|
347
|
+
force_ocr (bool)
|
|
348
|
+
ocr_engine (str)
|
|
349
|
+
"""
|
|
350
|
+
force_ocr = kwargs.pop("force_ocr", False)
|
|
351
|
+
ocr_engine = kwargs.pop("ocr_engine", "tess4")
|
|
352
|
+
|
|
353
|
+
if not isinstance(force_ocr, bool):
|
|
354
|
+
raise ValueError("force_ocr must be a boolean")
|
|
355
|
+
|
|
356
|
+
if ocr_engine and ocr_engine not in ("tess4", "textract"):
|
|
357
|
+
raise ValueError(
|
|
358
|
+
"ocr_engine must be either 'tess4' for tesseract or 'textract'"
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
return force_ocr, ocr_engine
|
|
362
|
+
|
|
336
363
|
def _get_title(self, name):
|
|
337
364
|
"""Get the default title for a document from its path"""
|
|
338
365
|
return name.split(os.sep)[-1].rsplit(".", 1)[0]
|
|
339
366
|
|
|
340
367
|
def _upload_url(self, file_url, **kwargs):
|
|
341
368
|
"""Upload a document from a publicly accessible URL"""
|
|
369
|
+
# extract process-related args
|
|
370
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
371
|
+
|
|
372
|
+
# create the document
|
|
342
373
|
params = self._format_upload_parameters(file_url, **kwargs)
|
|
343
374
|
params["file_url"] = file_url
|
|
375
|
+
if force_ocr:
|
|
376
|
+
params["force_ocr"] = force_ocr
|
|
377
|
+
params["ocr_engine"] = ocr_engine
|
|
344
378
|
response = self.client.post("documents/", json=params)
|
|
345
|
-
|
|
379
|
+
create_json = response.json()
|
|
380
|
+
|
|
381
|
+
# wrap in Document object
|
|
382
|
+
doc = Document(self.client, create_json)
|
|
383
|
+
|
|
384
|
+
return doc
|
|
346
385
|
|
|
347
386
|
def _upload_file(self, file_, **kwargs):
|
|
348
387
|
"""Upload a document directly"""
|
|
349
388
|
# create the document
|
|
350
|
-
force_ocr =
|
|
389
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
390
|
+
|
|
351
391
|
params = self._format_upload_parameters(file_.name, **kwargs)
|
|
352
392
|
response = self.client.post("documents/", json=params)
|
|
353
393
|
|
|
@@ -357,12 +397,12 @@ class DocumentClient(BaseAPIClient):
|
|
|
357
397
|
response = requests_retry_session().put(presigned_url, data=file_.read())
|
|
358
398
|
|
|
359
399
|
# begin processing the document
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
)
|
|
400
|
+
doc = Document(self.client, create_json)
|
|
401
|
+
|
|
402
|
+
# begin processing
|
|
403
|
+
doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
|
|
364
404
|
|
|
365
|
-
return
|
|
405
|
+
return doc
|
|
366
406
|
|
|
367
407
|
def _collect_files(self, path, extensions):
|
|
368
408
|
"""Find the paths to files with specified extensions under a directory"""
|
|
@@ -379,171 +419,98 @@ class DocumentClient(BaseAPIClient):
|
|
|
379
419
|
|
|
380
420
|
def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
|
|
381
421
|
"""Upload files with specified extensions in a directory"""
|
|
382
|
-
# pylint:
|
|
383
|
-
|
|
384
|
-
# Do not set the same title for all documents
|
|
422
|
+
# pylint:disable=too-many-locals
|
|
385
423
|
kwargs.pop("title", None)
|
|
386
424
|
|
|
387
|
-
# If extensions are specified as None, it will check for all supported
|
|
388
|
-
# filetypes.
|
|
389
425
|
if extensions is None:
|
|
390
426
|
extensions = SUPPORTED_EXTENSIONS
|
|
391
|
-
|
|
392
|
-
# Convert single extension to a list if provided
|
|
393
427
|
if extensions and not isinstance(extensions, list):
|
|
394
428
|
extensions = [extensions]
|
|
395
|
-
|
|
396
|
-
# Checks to see if the extensions are supported, raises an error if not.
|
|
397
429
|
invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
|
|
398
430
|
if invalid_extensions:
|
|
399
431
|
raise ValueError(
|
|
400
432
|
f"Invalid extensions provided: {', '.join(invalid_extensions)}"
|
|
401
433
|
)
|
|
402
434
|
|
|
403
|
-
# Loop through the path and get all the files with matching extensions
|
|
404
435
|
path_list = self._collect_files(path, extensions)
|
|
405
|
-
|
|
406
436
|
logger.info(
|
|
407
|
-
"Upload directory on %s: Found %d files to upload",
|
|
408
|
-
path,
|
|
409
|
-
len(path_list)
|
|
437
|
+
"Upload directory on %s: Found %d files to upload", path, len(path_list)
|
|
410
438
|
)
|
|
411
439
|
|
|
412
|
-
# Upload all the files using the bulk API to reduce the number
|
|
413
|
-
# of API calls and improve performance
|
|
414
440
|
obj_list = []
|
|
441
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
415
442
|
params = self._format_upload_parameters("", **kwargs)
|
|
443
|
+
|
|
416
444
|
for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
|
|
417
|
-
# Grouper will put None's on the end of the last group
|
|
418
445
|
file_paths = [p for p in file_paths if p is not None]
|
|
419
|
-
|
|
420
446
|
logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
|
|
421
447
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
json=[
|
|
428
|
-
merge_dicts(
|
|
429
|
-
params,
|
|
430
|
-
{
|
|
431
|
-
"title": self._get_title(p),
|
|
432
|
-
"original_extension": os.path.splitext(
|
|
433
|
-
os.path.basename(p)
|
|
434
|
-
)[1]
|
|
435
|
-
.lower()
|
|
436
|
-
.lstrip("."),
|
|
437
|
-
},
|
|
438
|
-
)
|
|
439
|
-
for p in file_paths
|
|
440
|
-
],
|
|
441
|
-
)
|
|
442
|
-
except (APIError, RequestException) as exc:
|
|
443
|
-
if handle_errors:
|
|
444
|
-
logger.info(
|
|
445
|
-
"Error creating the following documents: %s\n%s",
|
|
446
|
-
exc,
|
|
447
|
-
"\n".join(file_paths)
|
|
448
|
-
)
|
|
449
|
-
continue
|
|
450
|
-
else:
|
|
451
|
-
raise
|
|
448
|
+
create_json = self._create_documents(file_paths, params, handle_errors)
|
|
449
|
+
sorted_create_json = sorted(create_json, key=lambda j: j["title"])
|
|
450
|
+
sorted_file_paths = sorted(file_paths, key=self._get_title)
|
|
451
|
+
obj_list.extend(sorted_create_json)
|
|
452
|
+
presigned_urls = [j["presigned_url"] for j in sorted_create_json]
|
|
452
453
|
|
|
453
|
-
|
|
454
|
-
create_json
|
|
455
|
-
obj_list.extend(create_json)
|
|
456
|
-
presigned_urls = [j["presigned_url"] for j in create_json]
|
|
457
|
-
for url, file_path in zip(presigned_urls, file_paths):
|
|
458
|
-
logger.info("Uploading %s to S3...", file_path)
|
|
459
|
-
try:
|
|
460
|
-
with open(file_path, "rb") as file:
|
|
461
|
-
response = requests_retry_session().put(url, data=file.read())
|
|
462
|
-
self.client.raise_for_status(response)
|
|
463
|
-
except (APIError, RequestException) as exc:
|
|
464
|
-
if handle_errors:
|
|
465
|
-
logger.info(
|
|
466
|
-
"Error uploading the following document: %s %s",
|
|
467
|
-
exc,
|
|
468
|
-
file_path
|
|
469
|
-
)
|
|
470
|
-
continue
|
|
471
|
-
else:
|
|
472
|
-
raise
|
|
473
|
-
|
|
474
|
-
# Begin processing the documents
|
|
475
|
-
logger.info("Processing the documents...")
|
|
476
|
-
doc_ids = [j["id"] for j in create_json]
|
|
477
|
-
try:
|
|
478
|
-
response = self.client.post("documents/process/", json={"ids": doc_ids})
|
|
479
|
-
except (APIError, RequestException) as exc:
|
|
480
|
-
if handle_errors:
|
|
481
|
-
logger.info(
|
|
482
|
-
"Error creating the following documents: %s\n%s",
|
|
483
|
-
exc,
|
|
484
|
-
"\n".join(file_paths)
|
|
485
|
-
)
|
|
486
|
-
continue
|
|
487
|
-
else:
|
|
488
|
-
raise
|
|
454
|
+
self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
|
|
455
|
+
self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
|
|
489
456
|
|
|
490
457
|
logger.info("Upload directory complete")
|
|
491
|
-
|
|
492
|
-
# Pass back the list of documents
|
|
493
458
|
return [Document(self.client, d) for d in obj_list]
|
|
494
459
|
|
|
495
|
-
def
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
url_group = [url for url in url_group if url is not None]
|
|
506
|
-
|
|
507
|
-
logger.info(
|
|
508
|
-
"Uploading group %d: %s",
|
|
509
|
-
i + 1,
|
|
510
|
-
"\n".join(url_group)
|
|
460
|
+
def _create_documents(self, file_paths, params, handle_errors):
|
|
461
|
+
body = [
|
|
462
|
+
merge_dicts(
|
|
463
|
+
params,
|
|
464
|
+
{
|
|
465
|
+
"title": self._get_title(p),
|
|
466
|
+
"original_extension": os.path.splitext(os.path.basename(p))[1]
|
|
467
|
+
.lower()
|
|
468
|
+
.lstrip("."),
|
|
469
|
+
},
|
|
511
470
|
)
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
"title": self._get_title(url),
|
|
523
|
-
"file_url": url,
|
|
524
|
-
},
|
|
525
|
-
)
|
|
526
|
-
for url in url_group
|
|
527
|
-
],
|
|
471
|
+
for p in sorted(file_paths)
|
|
472
|
+
]
|
|
473
|
+
try:
|
|
474
|
+
response = self.client.post("documents/", json=body)
|
|
475
|
+
except (APIError, RequestException) as exc:
|
|
476
|
+
if handle_errors:
|
|
477
|
+
logger.info(
|
|
478
|
+
"Error creating the following documents: %s\n%s",
|
|
479
|
+
exc,
|
|
480
|
+
"\n".join(file_paths),
|
|
528
481
|
)
|
|
482
|
+
return []
|
|
483
|
+
else:
|
|
484
|
+
raise
|
|
485
|
+
return response.json()
|
|
486
|
+
|
|
487
|
+
def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
|
|
488
|
+
for url, file_path in zip(presigned_urls, file_paths):
|
|
489
|
+
logger.info("Uploading %s to S3...", file_path)
|
|
490
|
+
try:
|
|
491
|
+
with open(file_path, "rb") as f:
|
|
492
|
+
response = requests_retry_session().put(url, data=f.read())
|
|
493
|
+
self.client.raise_for_status(response)
|
|
529
494
|
except (APIError, RequestException) as exc:
|
|
530
495
|
if handle_errors:
|
|
531
496
|
logger.info(
|
|
532
|
-
"Error
|
|
533
|
-
str(exc),
|
|
534
|
-
"\n".join(url_group)
|
|
497
|
+
"Error uploading the following document: %s %s", exc, file_path
|
|
535
498
|
)
|
|
536
|
-
continue
|
|
537
499
|
else:
|
|
538
500
|
raise
|
|
539
501
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
502
|
+
def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
|
|
503
|
+
payload = [
|
|
504
|
+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
|
|
505
|
+
for j in create_json
|
|
506
|
+
]
|
|
507
|
+
try:
|
|
508
|
+
self.client.post("documents/process/", json=payload)
|
|
509
|
+
except (APIError, RequestException) as exc:
|
|
510
|
+
if handle_errors:
|
|
511
|
+
logger.info("Error processing documents: %s", exc)
|
|
512
|
+
else:
|
|
513
|
+
raise
|
|
547
514
|
|
|
548
515
|
|
|
549
516
|
class Mention:
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Custom exceptions for python-documentcloud
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Third Party
|
|
6
|
+
# pylint: disable=unused-import
|
|
7
|
+
# Import exceptions from python-squarelet
|
|
8
|
+
from squarelet.exceptions import (
|
|
9
|
+
APIError,
|
|
10
|
+
CredentialsFailedError,
|
|
11
|
+
DoesNotExistError,
|
|
12
|
+
DuplicateObjectError,
|
|
13
|
+
MultipleObjectsReturnedError,
|
|
14
|
+
SquareletError as DocumentCloudError,
|
|
15
|
+
)
|
{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/SOURCES.txt
RENAMED
|
@@ -20,6 +20,7 @@ python_documentcloud.egg-info/SOURCES.txt
|
|
|
20
20
|
python_documentcloud.egg-info/dependency_links.txt
|
|
21
21
|
python_documentcloud.egg-info/requires.txt
|
|
22
22
|
python_documentcloud.egg-info/top_level.txt
|
|
23
|
+
tests/test_addon.py
|
|
23
24
|
tests/test_annotations.py
|
|
24
25
|
tests/test_base.py
|
|
25
26
|
tests/test_client.py
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Standard Library
|
|
2
|
+
from unittest.mock import MagicMock
|
|
3
|
+
|
|
4
|
+
# Third Party
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
# DocumentCloud
|
|
8
|
+
from documentcloud.addon import AddOn
|
|
9
|
+
|
|
10
|
+
# pylint: disable=redefined-outer-name
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.fixture
|
|
14
|
+
def addon():
|
|
15
|
+
"""An AddOn instance built without invoking argparse or constructing a real client.
|
|
16
|
+
|
|
17
|
+
Tests can override `.id`, `.event_id`, `.client`, etc. as needed.
|
|
18
|
+
"""
|
|
19
|
+
instance = AddOn.__new__(AddOn)
|
|
20
|
+
instance.id = "run-123"
|
|
21
|
+
instance.addon_id = "addon-1"
|
|
22
|
+
instance.event_id = None
|
|
23
|
+
instance.documents = None
|
|
24
|
+
instance.query = None
|
|
25
|
+
instance.user_id = None
|
|
26
|
+
instance.org_id = None
|
|
27
|
+
instance.data = {}
|
|
28
|
+
instance.title = "Test AddOn"
|
|
29
|
+
instance.client = MagicMock()
|
|
30
|
+
return instance
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TestLoadRunData:
|
|
34
|
+
def test_returns_data_when_run_id_set(self, addon):
|
|
35
|
+
addon.client.get.return_value.json.return_value = {"data": {"foo": "bar"}}
|
|
36
|
+
|
|
37
|
+
result = addon.load_run_data()
|
|
38
|
+
|
|
39
|
+
addon.client.get.assert_called_once_with("addon_runs/run-123/")
|
|
40
|
+
assert result == {"foo": "bar"}
|
|
41
|
+
|
|
42
|
+
def test_returns_empty_dict_when_no_run_id(self, addon):
|
|
43
|
+
addon.id = None
|
|
44
|
+
|
|
45
|
+
assert addon.load_run_data() == {}
|
|
46
|
+
addon.client.get.assert_not_called()
|
|
47
|
+
|
|
48
|
+
def test_returns_empty_dict_when_data_missing_from_response(self, addon):
|
|
49
|
+
addon.client.get.return_value.json.return_value = {}
|
|
50
|
+
|
|
51
|
+
assert addon.load_run_data() == {}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class TestStoreRunData:
|
|
55
|
+
def test_patches_run_with_data(self, addon):
|
|
56
|
+
addon.store_run_data({"foo": "bar"})
|
|
57
|
+
|
|
58
|
+
addon.client.patch.assert_called_once_with(
|
|
59
|
+
"addon_runs/run-123/", json={"data": {"foo": "bar"}}
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
def test_no_op_when_no_run_id(self, addon, capsys):
|
|
63
|
+
addon.id = None
|
|
64
|
+
|
|
65
|
+
result = addon.store_run_data({"foo": "bar"})
|
|
66
|
+
|
|
67
|
+
assert result is None
|
|
68
|
+
addon.client.patch.assert_not_called()
|
|
69
|
+
assert "Run ID not set" in capsys.readouterr().out
|
|
70
|
+
|
|
71
|
+
def test_rejects_non_dict_data(self, addon):
|
|
72
|
+
with pytest.raises(TypeError):
|
|
73
|
+
addon.store_run_data("not a dict")
|
|
74
|
+
|
|
75
|
+
addon.client.patch.assert_not_called()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class TestLoadEventData:
|
|
79
|
+
def test_returns_scratch_when_event_id_set(self, addon):
|
|
80
|
+
addon.event_id = "evt-9"
|
|
81
|
+
addon.client.get.return_value.json.return_value = {"scratch": {"x": 1}}
|
|
82
|
+
|
|
83
|
+
result = addon.load_event_data()
|
|
84
|
+
|
|
85
|
+
addon.client.get.assert_called_once_with("addon_events/evt-9/")
|
|
86
|
+
assert result == {"x": 1}
|
|
87
|
+
|
|
88
|
+
def test_returns_none_when_no_event_id(self, addon):
|
|
89
|
+
assert addon.load_event_data() is None
|
|
90
|
+
addon.client.get.assert_not_called()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class TestStoreEventData:
|
|
94
|
+
def test_patches_event_with_scratch(self, addon):
|
|
95
|
+
addon.event_id = "evt-9"
|
|
96
|
+
|
|
97
|
+
addon.store_event_data({"x": 1})
|
|
98
|
+
|
|
99
|
+
addon.client.patch.assert_called_once_with(
|
|
100
|
+
"addon_events/evt-9/", json={"scratch": {"x": 1}}
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
def test_no_op_when_no_event_id(self, addon):
|
|
104
|
+
assert addon.store_event_data({"x": 1}) is None
|
|
105
|
+
addon.client.patch.assert_not_called()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@pytest.fixture
|
|
109
|
+
def real_addon(client, addon_run):
|
|
110
|
+
"""An AddOn wired to the real `client` fixture and a freshly created run."""
|
|
111
|
+
instance = AddOn.__new__(AddOn)
|
|
112
|
+
instance.id = addon_run
|
|
113
|
+
instance.addon_id = None
|
|
114
|
+
instance.event_id = None
|
|
115
|
+
instance.documents = None
|
|
116
|
+
instance.query = None
|
|
117
|
+
instance.user_id = None
|
|
118
|
+
instance.org_id = None
|
|
119
|
+
instance.data = {}
|
|
120
|
+
instance.title = "Test AddOn"
|
|
121
|
+
instance.client = client
|
|
122
|
+
return instance
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class TestRunDataVCR:
|
|
126
|
+
"""VCR-recorded round-trip tests against the dev DC.
|
|
127
|
+
|
|
128
|
+
Recording: set DC_TEST_ADDON_RUN_ID to an existing AddOnRun UUID on your
|
|
129
|
+
local dev DC, then run `make test-dev` (or `pytest --record-mode=new_episodes`).
|
|
130
|
+
"""
|
|
131
|
+
|
|
132
|
+
def test_load_run_data_returns_dict(self, real_addon):
|
|
133
|
+
result = real_addon.load_run_data()
|
|
134
|
+
assert isinstance(result, dict)
|
|
135
|
+
|
|
136
|
+
def test_store_then_load_run_data_round_trip(self, real_addon):
|
|
137
|
+
payload = {"foo": "bar", "n": 42}
|
|
138
|
+
real_addon.store_run_data(payload)
|
|
139
|
+
loaded = real_addon.load_run_data()
|
|
140
|
+
assert loaded.get("foo") == "bar"
|
|
141
|
+
assert loaded.get("n") == 42
|
|
@@ -158,9 +158,7 @@ class TestDocument:
|
|
|
158
158
|
|
|
159
159
|
class TestDocumentClient:
|
|
160
160
|
def test_search(self, client, document):
|
|
161
|
-
documents = client.documents.search(
|
|
162
|
-
f"document:{document.id} simple"
|
|
163
|
-
)
|
|
161
|
+
documents = client.documents.search(f"document:{document.id} simple")
|
|
164
162
|
assert documents
|
|
165
163
|
|
|
166
164
|
def test_list(self, client):
|
|
@@ -182,7 +180,6 @@ class TestDocumentClient:
|
|
|
182
180
|
document = document_factory(pdf)
|
|
183
181
|
assert document.status == "success"
|
|
184
182
|
|
|
185
|
-
|
|
186
183
|
def test_upload_file_path(self, document_factory):
|
|
187
184
|
document = document_factory("tests/test.pdf")
|
|
188
185
|
assert document.status == "success"
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Custom exceptions for python-documentcloud
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
# pylint: disable=unused-import
|
|
6
|
-
# Import exceptions from python-squarelet
|
|
7
|
-
from squarelet.exceptions import SquareletError as DocumentCloudError
|
|
8
|
-
from squarelet.exceptions import DuplicateObjectError
|
|
9
|
-
from squarelet.exceptions import CredentialsFailedError
|
|
10
|
-
from squarelet.exceptions import APIError
|
|
11
|
-
from squarelet.exceptions import DoesNotExistError
|
|
12
|
-
from squarelet.exceptions import MultipleObjectsReturnedError
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|