python-documentcloud 4.4.1__py2.py3-none-any.whl → 4.5.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documentcloud/client.py +4 -8
- documentcloud/documents.py +111 -140
- {python_documentcloud-4.4.1.dist-info → python_documentcloud-4.5.0.dist-info}/METADATA +1 -1
- {python_documentcloud-4.4.1.dist-info → python_documentcloud-4.5.0.dist-info}/RECORD +7 -7
- {python_documentcloud-4.4.1.dist-info → python_documentcloud-4.5.0.dist-info}/WHEEL +1 -1
- {python_documentcloud-4.4.1.dist-info → python_documentcloud-4.5.0.dist-info}/licenses/LICENSE +0 -0
- {python_documentcloud-4.4.1.dist-info → python_documentcloud-4.5.0.dist-info}/top_level.txt +0 -0
documentcloud/client.py
CHANGED
|
@@ -14,11 +14,12 @@ from .users import UserClient
|
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger("documentcloud")
|
|
16
16
|
|
|
17
|
+
|
|
17
18
|
class DocumentCloud(SquareletClient):
|
|
18
19
|
"""
|
|
19
20
|
The public interface for the DocumentCloud API, now integrated with SquareletClient
|
|
20
21
|
"""
|
|
21
|
-
|
|
22
|
+
|
|
22
23
|
def __init__(
|
|
23
24
|
self,
|
|
24
25
|
username=None,
|
|
@@ -30,7 +31,7 @@ class DocumentCloud(SquareletClient):
|
|
|
30
31
|
rate_limit=True,
|
|
31
32
|
rate_limit_sleep=True,
|
|
32
33
|
):
|
|
33
|
-
|
|
34
|
+
# Initialize SquareletClient for authentication and request handling
|
|
34
35
|
super().__init__(
|
|
35
36
|
base_uri=base_uri,
|
|
36
37
|
username=username,
|
|
@@ -38,7 +39,7 @@ class DocumentCloud(SquareletClient):
|
|
|
38
39
|
auth_uri=auth_uri,
|
|
39
40
|
timeout=timeout,
|
|
40
41
|
rate_limit=rate_limit,
|
|
41
|
-
rate_limit_sleep=rate_limit_sleep
|
|
42
|
+
rate_limit_sleep=rate_limit_sleep,
|
|
42
43
|
)
|
|
43
44
|
|
|
44
45
|
# Set up logging
|
|
@@ -55,8 +56,3 @@ class DocumentCloud(SquareletClient):
|
|
|
55
56
|
self.projects = ProjectClient(self)
|
|
56
57
|
self.users = UserClient(self)
|
|
57
58
|
self.organizations = OrganizationClient(self)
|
|
58
|
-
|
|
59
|
-
"""def _request(self, method, url, raise_error=True, **kwargs):
|
|
60
|
-
Delegates request to the SquareletClient's _request method
|
|
61
|
-
return self.squarelet_client.request(method, url, raise_error, **kwargs)
|
|
62
|
-
"""
|
documentcloud/documents.py
CHANGED
|
@@ -74,8 +74,11 @@ class Document(BaseAPIObject):
|
|
|
74
74
|
def __getattr__(self, attr):
|
|
75
75
|
"""Generate methods for fetching resources"""
|
|
76
76
|
p_image = re.compile(
|
|
77
|
-
r"^get_
|
|
77
|
+
r"^get_"
|
|
78
|
+
r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
|
|
79
|
+
r"(?P<list>_list)?$"
|
|
78
80
|
)
|
|
81
|
+
|
|
79
82
|
get = attr.startswith("get_")
|
|
80
83
|
url = attr.endswith("_url")
|
|
81
84
|
text = attr.endswith("_text")
|
|
@@ -230,9 +233,15 @@ class Document(BaseAPIObject):
|
|
|
230
233
|
|
|
231
234
|
return all_results
|
|
232
235
|
|
|
233
|
-
def process(self):
|
|
234
|
-
"""
|
|
235
|
-
|
|
236
|
+
def process(self, **kwargs):
|
|
237
|
+
"""Process the document, used on upload and for reprocessing"""
|
|
238
|
+
payload = {}
|
|
239
|
+
if "force_ocr" in kwargs:
|
|
240
|
+
payload["force_ocr"] = kwargs["force_ocr"]
|
|
241
|
+
if "ocr_engine" in kwargs:
|
|
242
|
+
payload["ocr_engine"] = kwargs["ocr_engine"]
|
|
243
|
+
|
|
244
|
+
self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
|
|
236
245
|
|
|
237
246
|
|
|
238
247
|
class DocumentClient(BaseAPIClient):
|
|
@@ -310,6 +319,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
310
319
|
"title",
|
|
311
320
|
"data",
|
|
312
321
|
"force_ocr",
|
|
322
|
+
"ocr_engine",
|
|
313
323
|
"projects",
|
|
314
324
|
"delayed_index",
|
|
315
325
|
"revision_control",
|
|
@@ -333,21 +343,55 @@ class DocumentClient(BaseAPIClient):
|
|
|
333
343
|
|
|
334
344
|
return params
|
|
335
345
|
|
|
346
|
+
def _extract_ocr_options(self, kwargs):
|
|
347
|
+
"""
|
|
348
|
+
Extract and validate OCR options from kwargs.
|
|
349
|
+
|
|
350
|
+
Returns:
|
|
351
|
+
force_ocr (bool)
|
|
352
|
+
ocr_engine (str)
|
|
353
|
+
"""
|
|
354
|
+
force_ocr = kwargs.pop("force_ocr", False)
|
|
355
|
+
ocr_engine = kwargs.pop("ocr_engine", "tess4")
|
|
356
|
+
|
|
357
|
+
if not isinstance(force_ocr, bool):
|
|
358
|
+
raise ValueError("force_ocr must be a boolean")
|
|
359
|
+
|
|
360
|
+
if ocr_engine and ocr_engine not in ("tess4", "textract"):
|
|
361
|
+
raise ValueError(
|
|
362
|
+
"ocr_engine must be either 'tess4' for tesseract or 'textract'"
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
return force_ocr, ocr_engine
|
|
366
|
+
|
|
336
367
|
def _get_title(self, name):
|
|
337
368
|
"""Get the default title for a document from its path"""
|
|
338
369
|
return name.split(os.sep)[-1].rsplit(".", 1)[0]
|
|
339
370
|
|
|
340
371
|
def _upload_url(self, file_url, **kwargs):
|
|
341
372
|
"""Upload a document from a publicly accessible URL"""
|
|
373
|
+
# extract process-related args
|
|
374
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
375
|
+
|
|
376
|
+
# create the document
|
|
342
377
|
params = self._format_upload_parameters(file_url, **kwargs)
|
|
343
378
|
params["file_url"] = file_url
|
|
379
|
+
if force_ocr:
|
|
380
|
+
params["force_ocr"] = force_ocr
|
|
381
|
+
params["ocr_engine"] = ocr_engine
|
|
344
382
|
response = self.client.post("documents/", json=params)
|
|
345
|
-
|
|
383
|
+
create_json = response.json()
|
|
384
|
+
|
|
385
|
+
# wrap in Document object
|
|
386
|
+
doc = Document(self.client, create_json)
|
|
387
|
+
|
|
388
|
+
return doc
|
|
346
389
|
|
|
347
390
|
def _upload_file(self, file_, **kwargs):
|
|
348
391
|
"""Upload a document directly"""
|
|
349
392
|
# create the document
|
|
350
|
-
force_ocr =
|
|
393
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
394
|
+
|
|
351
395
|
params = self._format_upload_parameters(file_.name, **kwargs)
|
|
352
396
|
response = self.client.post("documents/", json=params)
|
|
353
397
|
|
|
@@ -357,12 +401,12 @@ class DocumentClient(BaseAPIClient):
|
|
|
357
401
|
response = requests_retry_session().put(presigned_url, data=file_.read())
|
|
358
402
|
|
|
359
403
|
# begin processing the document
|
|
360
|
-
|
|
361
|
-
response = self.client.post(
|
|
362
|
-
f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
|
|
363
|
-
)
|
|
404
|
+
doc = Document(self.client, create_json)
|
|
364
405
|
|
|
365
|
-
|
|
406
|
+
# begin processing
|
|
407
|
+
doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
|
|
408
|
+
|
|
409
|
+
return doc
|
|
366
410
|
|
|
367
411
|
def _collect_files(self, path, extensions):
|
|
368
412
|
"""Find the paths to files with specified extensions under a directory"""
|
|
@@ -379,171 +423,98 @@ class DocumentClient(BaseAPIClient):
|
|
|
379
423
|
|
|
380
424
|
def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
|
|
381
425
|
"""Upload files with specified extensions in a directory"""
|
|
382
|
-
# pylint:
|
|
383
|
-
|
|
384
|
-
# Do not set the same title for all documents
|
|
426
|
+
# pylint:disable=too-many-locals
|
|
385
427
|
kwargs.pop("title", None)
|
|
386
428
|
|
|
387
|
-
# If extensions are specified as None, it will check for all supported
|
|
388
|
-
# filetypes.
|
|
389
429
|
if extensions is None:
|
|
390
430
|
extensions = SUPPORTED_EXTENSIONS
|
|
391
|
-
|
|
392
|
-
# Convert single extension to a list if provided
|
|
393
431
|
if extensions and not isinstance(extensions, list):
|
|
394
432
|
extensions = [extensions]
|
|
395
|
-
|
|
396
|
-
# Checks to see if the extensions are supported, raises an error if not.
|
|
397
433
|
invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
|
|
398
434
|
if invalid_extensions:
|
|
399
435
|
raise ValueError(
|
|
400
436
|
f"Invalid extensions provided: {', '.join(invalid_extensions)}"
|
|
401
437
|
)
|
|
402
438
|
|
|
403
|
-
# Loop through the path and get all the files with matching extensions
|
|
404
439
|
path_list = self._collect_files(path, extensions)
|
|
405
|
-
|
|
406
440
|
logger.info(
|
|
407
|
-
"Upload directory on %s: Found %d files to upload",
|
|
408
|
-
path,
|
|
409
|
-
len(path_list)
|
|
441
|
+
"Upload directory on %s: Found %d files to upload", path, len(path_list)
|
|
410
442
|
)
|
|
411
443
|
|
|
412
|
-
# Upload all the files using the bulk API to reduce the number
|
|
413
|
-
# of API calls and improve performance
|
|
414
444
|
obj_list = []
|
|
445
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
415
446
|
params = self._format_upload_parameters("", **kwargs)
|
|
447
|
+
|
|
416
448
|
for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
|
|
417
|
-
# Grouper will put None's on the end of the last group
|
|
418
449
|
file_paths = [p for p in file_paths if p is not None]
|
|
419
|
-
|
|
420
450
|
logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
|
|
421
451
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
json=[
|
|
428
|
-
merge_dicts(
|
|
429
|
-
params,
|
|
430
|
-
{
|
|
431
|
-
"title": self._get_title(p),
|
|
432
|
-
"original_extension": os.path.splitext(
|
|
433
|
-
os.path.basename(p)
|
|
434
|
-
)[1]
|
|
435
|
-
.lower()
|
|
436
|
-
.lstrip("."),
|
|
437
|
-
},
|
|
438
|
-
)
|
|
439
|
-
for p in file_paths
|
|
440
|
-
],
|
|
441
|
-
)
|
|
442
|
-
except (APIError, RequestException) as exc:
|
|
443
|
-
if handle_errors:
|
|
444
|
-
logger.info(
|
|
445
|
-
"Error creating the following documents: %s\n%s",
|
|
446
|
-
exc,
|
|
447
|
-
"\n".join(file_paths)
|
|
448
|
-
)
|
|
449
|
-
continue
|
|
450
|
-
else:
|
|
451
|
-
raise
|
|
452
|
+
create_json = self._create_documents(file_paths, params, handle_errors)
|
|
453
|
+
sorted_create_json = sorted(create_json, key=lambda j: j["title"])
|
|
454
|
+
sorted_file_paths = sorted(file_paths, key=self._get_title)
|
|
455
|
+
obj_list.extend(sorted_create_json)
|
|
456
|
+
presigned_urls = [j["presigned_url"] for j in sorted_create_json]
|
|
452
457
|
|
|
453
|
-
|
|
454
|
-
create_json
|
|
455
|
-
obj_list.extend(create_json)
|
|
456
|
-
presigned_urls = [j["presigned_url"] for j in create_json]
|
|
457
|
-
for url, file_path in zip(presigned_urls, file_paths):
|
|
458
|
-
logger.info("Uploading %s to S3...", file_path)
|
|
459
|
-
try:
|
|
460
|
-
with open(file_path, "rb") as file:
|
|
461
|
-
response = requests_retry_session().put(url, data=file.read())
|
|
462
|
-
self.client.raise_for_status(response)
|
|
463
|
-
except (APIError, RequestException) as exc:
|
|
464
|
-
if handle_errors:
|
|
465
|
-
logger.info(
|
|
466
|
-
"Error uploading the following document: %s %s",
|
|
467
|
-
exc,
|
|
468
|
-
file_path
|
|
469
|
-
)
|
|
470
|
-
continue
|
|
471
|
-
else:
|
|
472
|
-
raise
|
|
473
|
-
|
|
474
|
-
# Begin processing the documents
|
|
475
|
-
logger.info("Processing the documents...")
|
|
476
|
-
doc_ids = [j["id"] for j in create_json]
|
|
477
|
-
try:
|
|
478
|
-
response = self.client.post("documents/process/", json={"ids": doc_ids})
|
|
479
|
-
except (APIError, RequestException) as exc:
|
|
480
|
-
if handle_errors:
|
|
481
|
-
logger.info(
|
|
482
|
-
"Error creating the following documents: %s\n%s",
|
|
483
|
-
exc,
|
|
484
|
-
"\n".join(file_paths)
|
|
485
|
-
)
|
|
486
|
-
continue
|
|
487
|
-
else:
|
|
488
|
-
raise
|
|
458
|
+
self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
|
|
459
|
+
self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
|
|
489
460
|
|
|
490
461
|
logger.info("Upload directory complete")
|
|
491
|
-
|
|
492
|
-
# Pass back the list of documents
|
|
493
462
|
return [Document(self.client, d) for d in obj_list]
|
|
494
463
|
|
|
495
|
-
def
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
url_group = [url for url in url_group if url is not None]
|
|
506
|
-
|
|
507
|
-
logger.info(
|
|
508
|
-
"Uploading group %d: %s",
|
|
509
|
-
i + 1,
|
|
510
|
-
"\n".join(url_group)
|
|
464
|
+
def _create_documents(self, file_paths, params, handle_errors):
|
|
465
|
+
body = [
|
|
466
|
+
merge_dicts(
|
|
467
|
+
params,
|
|
468
|
+
{
|
|
469
|
+
"title": self._get_title(p),
|
|
470
|
+
"original_extension": os.path.splitext(os.path.basename(p))[1]
|
|
471
|
+
.lower()
|
|
472
|
+
.lstrip("."),
|
|
473
|
+
},
|
|
511
474
|
)
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
"title": self._get_title(url),
|
|
523
|
-
"file_url": url,
|
|
524
|
-
},
|
|
525
|
-
)
|
|
526
|
-
for url in url_group
|
|
527
|
-
],
|
|
475
|
+
for p in sorted(file_paths)
|
|
476
|
+
]
|
|
477
|
+
try:
|
|
478
|
+
response = self.client.post("documents/", json=body)
|
|
479
|
+
except (APIError, RequestException) as exc:
|
|
480
|
+
if handle_errors:
|
|
481
|
+
logger.info(
|
|
482
|
+
"Error creating the following documents: %s\n%s",
|
|
483
|
+
exc,
|
|
484
|
+
"\n".join(file_paths),
|
|
528
485
|
)
|
|
486
|
+
return []
|
|
487
|
+
else:
|
|
488
|
+
raise
|
|
489
|
+
return response.json()
|
|
490
|
+
|
|
491
|
+
def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
|
|
492
|
+
for url, file_path in zip(presigned_urls, file_paths):
|
|
493
|
+
logger.info("Uploading %s to S3...", file_path)
|
|
494
|
+
try:
|
|
495
|
+
with open(file_path, "rb") as f:
|
|
496
|
+
response = requests_retry_session().put(url, data=f.read())
|
|
497
|
+
self.client.raise_for_status(response)
|
|
529
498
|
except (APIError, RequestException) as exc:
|
|
530
499
|
if handle_errors:
|
|
531
500
|
logger.info(
|
|
532
|
-
"Error
|
|
533
|
-
str(exc),
|
|
534
|
-
"\n".join(url_group)
|
|
501
|
+
"Error uploading the following document: %s %s", exc, file_path
|
|
535
502
|
)
|
|
536
|
-
continue
|
|
537
503
|
else:
|
|
538
504
|
raise
|
|
539
505
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
506
|
+
def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
|
|
507
|
+
payload = [
|
|
508
|
+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
|
|
509
|
+
for j in create_json
|
|
510
|
+
]
|
|
511
|
+
try:
|
|
512
|
+
self.client.post("documents/process/", json=payload)
|
|
513
|
+
except (APIError, RequestException) as exc:
|
|
514
|
+
if handle_errors:
|
|
515
|
+
logger.info("Error processing documents: %s", exc)
|
|
516
|
+
else:
|
|
517
|
+
raise
|
|
547
518
|
|
|
548
519
|
|
|
549
520
|
class Mention:
|
|
@@ -2,17 +2,17 @@ documentcloud/__init__.py,sha256=XAwOR6JYL-flQV_uC616AMA2rYiXTkeogNolqE6LzN4,220
|
|
|
2
2
|
documentcloud/addon.py,sha256=3FxQjm26jknjLdd-GuztiZO4Z7NcgXq4WqunE9oh2es,11754
|
|
3
3
|
documentcloud/annotations.py,sha256=wVe3wYzyTRvc_hJ3r0m6iyDf6WIFlaGcCnyah_r53pg,2538
|
|
4
4
|
documentcloud/base.py,sha256=pNF45aleYpQ9fj75CiL3c4Ssv6MO1EmdzZ6wBLPKHDg,6545
|
|
5
|
-
documentcloud/client.py,sha256=
|
|
5
|
+
documentcloud/client.py,sha256=WXHNE1BT-LE2E55XlOvPuWl_g5N0zUIdXvB7Qj_fMNc,1658
|
|
6
6
|
documentcloud/constants.py,sha256=h6NStSkxPrjQ2gzaIlqftCF7tthkRimddOE8SsmlHag,1828
|
|
7
|
-
documentcloud/documents.py,sha256=
|
|
7
|
+
documentcloud/documents.py,sha256=dgoUr2XsxYmxC1xv3lJHgFQdJyE_rBNa2QS0Mn5Y2Is,18294
|
|
8
8
|
documentcloud/exceptions.py,sha256=AwIJpcylq6sF6oaenrZE6nr2EBuj23nxTOf3z_RwtuE,461
|
|
9
9
|
documentcloud/organizations.py,sha256=_Ot6MWzoa5JdU3jqedU-0Fec_K8WrgxqdlIp4oIijes,392
|
|
10
10
|
documentcloud/projects.py,sha256=KuOiw65a-8fdgbjo7BqjbEbWguds8inkhFJZJd578bs,5328
|
|
11
11
|
documentcloud/sections.py,sha256=cMf973KMvp6fAPSMXCD67L32Pz1_Tfh81oV2q2UQ9Uk,924
|
|
12
12
|
documentcloud/toolbox.py,sha256=zFZTyOn40YZjBpqa1H3qjpR4C3Wu1X2g72AvH_ljlic,1835
|
|
13
13
|
documentcloud/users.py,sha256=yydOXoEsfJlYqryZpXQ4G3aeRc5y_QCHqXd0dfF1aIc,354
|
|
14
|
-
python_documentcloud-4.
|
|
15
|
-
python_documentcloud-4.
|
|
16
|
-
python_documentcloud-4.
|
|
17
|
-
python_documentcloud-4.
|
|
18
|
-
python_documentcloud-4.
|
|
14
|
+
python_documentcloud-4.5.0.dist-info/licenses/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
|
|
15
|
+
python_documentcloud-4.5.0.dist-info/METADATA,sha256=90GM8QOJIaQfjjZ_KKyxqvkQr1rxKg6IpCnAI_FZS1I,2880
|
|
16
|
+
python_documentcloud-4.5.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
|
|
17
|
+
python_documentcloud-4.5.0.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
|
|
18
|
+
python_documentcloud-4.5.0.dist-info/RECORD,,
|
{python_documentcloud-4.4.1.dist-info → python_documentcloud-4.5.0.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|