python-documentcloud 4.4.0__py2.py3-none-any.whl → 4.5.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documentcloud/documents.py +111 -134
- {python_documentcloud-4.4.0.dist-info → python_documentcloud-4.5.0.dist-info}/METADATA +3 -2
- {python_documentcloud-4.4.0.dist-info → python_documentcloud-4.5.0.dist-info}/RECORD +6 -6
- {python_documentcloud-4.4.0.dist-info → python_documentcloud-4.5.0.dist-info}/WHEEL +1 -1
- {python_documentcloud-4.4.0.dist-info → python_documentcloud-4.5.0.dist-info/licenses}/LICENSE +0 -0
- {python_documentcloud-4.4.0.dist-info → python_documentcloud-4.5.0.dist-info}/top_level.txt +0 -0
documentcloud/documents.py
CHANGED
|
@@ -74,8 +74,11 @@ class Document(BaseAPIObject):
|
|
|
74
74
|
def __getattr__(self, attr):
|
|
75
75
|
"""Generate methods for fetching resources"""
|
|
76
76
|
p_image = re.compile(
|
|
77
|
-
r"^get_
|
|
77
|
+
r"^get_"
|
|
78
|
+
r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
|
|
79
|
+
r"(?P<list>_list)?$"
|
|
78
80
|
)
|
|
81
|
+
|
|
79
82
|
get = attr.startswith("get_")
|
|
80
83
|
url = attr.endswith("_url")
|
|
81
84
|
text = attr.endswith("_text")
|
|
@@ -230,9 +233,15 @@ class Document(BaseAPIObject):
|
|
|
230
233
|
|
|
231
234
|
return all_results
|
|
232
235
|
|
|
233
|
-
def process(self):
|
|
234
|
-
"""
|
|
235
|
-
|
|
236
|
+
def process(self, **kwargs):
|
|
237
|
+
"""Process the document, used on upload and for reprocessing"""
|
|
238
|
+
payload = {}
|
|
239
|
+
if "force_ocr" in kwargs:
|
|
240
|
+
payload["force_ocr"] = kwargs["force_ocr"]
|
|
241
|
+
if "ocr_engine" in kwargs:
|
|
242
|
+
payload["ocr_engine"] = kwargs["ocr_engine"]
|
|
243
|
+
|
|
244
|
+
self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
|
|
236
245
|
|
|
237
246
|
|
|
238
247
|
class DocumentClient(BaseAPIClient):
|
|
@@ -310,6 +319,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
310
319
|
"title",
|
|
311
320
|
"data",
|
|
312
321
|
"force_ocr",
|
|
322
|
+
"ocr_engine",
|
|
313
323
|
"projects",
|
|
314
324
|
"delayed_index",
|
|
315
325
|
"revision_control",
|
|
@@ -333,21 +343,55 @@ class DocumentClient(BaseAPIClient):
|
|
|
333
343
|
|
|
334
344
|
return params
|
|
335
345
|
|
|
346
|
+
def _extract_ocr_options(self, kwargs):
|
|
347
|
+
"""
|
|
348
|
+
Extract and validate OCR options from kwargs.
|
|
349
|
+
|
|
350
|
+
Returns:
|
|
351
|
+
force_ocr (bool)
|
|
352
|
+
ocr_engine (str)
|
|
353
|
+
"""
|
|
354
|
+
force_ocr = kwargs.pop("force_ocr", False)
|
|
355
|
+
ocr_engine = kwargs.pop("ocr_engine", "tess4")
|
|
356
|
+
|
|
357
|
+
if not isinstance(force_ocr, bool):
|
|
358
|
+
raise ValueError("force_ocr must be a boolean")
|
|
359
|
+
|
|
360
|
+
if ocr_engine and ocr_engine not in ("tess4", "textract"):
|
|
361
|
+
raise ValueError(
|
|
362
|
+
"ocr_engine must be either 'tess4' for tesseract or 'textract'"
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
return force_ocr, ocr_engine
|
|
366
|
+
|
|
336
367
|
def _get_title(self, name):
|
|
337
368
|
"""Get the default title for a document from its path"""
|
|
338
369
|
return name.split(os.sep)[-1].rsplit(".", 1)[0]
|
|
339
370
|
|
|
340
371
|
def _upload_url(self, file_url, **kwargs):
|
|
341
372
|
"""Upload a document from a publicly accessible URL"""
|
|
373
|
+
# extract process-related args
|
|
374
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
375
|
+
|
|
376
|
+
# create the document
|
|
342
377
|
params = self._format_upload_parameters(file_url, **kwargs)
|
|
343
378
|
params["file_url"] = file_url
|
|
379
|
+
if force_ocr:
|
|
380
|
+
params["force_ocr"] = force_ocr
|
|
381
|
+
params["ocr_engine"] = ocr_engine
|
|
344
382
|
response = self.client.post("documents/", json=params)
|
|
345
|
-
|
|
383
|
+
create_json = response.json()
|
|
384
|
+
|
|
385
|
+
# wrap in Document object
|
|
386
|
+
doc = Document(self.client, create_json)
|
|
387
|
+
|
|
388
|
+
return doc
|
|
346
389
|
|
|
347
390
|
def _upload_file(self, file_, **kwargs):
|
|
348
391
|
"""Upload a document directly"""
|
|
349
392
|
# create the document
|
|
350
|
-
force_ocr =
|
|
393
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
394
|
+
|
|
351
395
|
params = self._format_upload_parameters(file_.name, **kwargs)
|
|
352
396
|
response = self.client.post("documents/", json=params)
|
|
353
397
|
|
|
@@ -357,12 +401,12 @@ class DocumentClient(BaseAPIClient):
|
|
|
357
401
|
response = requests_retry_session().put(presigned_url, data=file_.read())
|
|
358
402
|
|
|
359
403
|
# begin processing the document
|
|
360
|
-
|
|
361
|
-
response = self.client.post(
|
|
362
|
-
f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
|
|
363
|
-
)
|
|
404
|
+
doc = Document(self.client, create_json)
|
|
364
405
|
|
|
365
|
-
|
|
406
|
+
# begin processing
|
|
407
|
+
doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
|
|
408
|
+
|
|
409
|
+
return doc
|
|
366
410
|
|
|
367
411
|
def _collect_files(self, path, extensions):
|
|
368
412
|
"""Find the paths to files with specified extensions under a directory"""
|
|
@@ -379,165 +423,98 @@ class DocumentClient(BaseAPIClient):
|
|
|
379
423
|
|
|
380
424
|
def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
|
|
381
425
|
"""Upload files with specified extensions in a directory"""
|
|
382
|
-
# pylint:
|
|
383
|
-
|
|
384
|
-
# Do not set the same title for all documents
|
|
426
|
+
# pylint:disable=too-many-locals
|
|
385
427
|
kwargs.pop("title", None)
|
|
386
428
|
|
|
387
|
-
# If extensions are specified as None, it will check for all supported
|
|
388
|
-
# filetypes.
|
|
389
429
|
if extensions is None:
|
|
390
430
|
extensions = SUPPORTED_EXTENSIONS
|
|
391
|
-
|
|
392
|
-
# Convert single extension to a list if provided
|
|
393
431
|
if extensions and not isinstance(extensions, list):
|
|
394
432
|
extensions = [extensions]
|
|
395
|
-
|
|
396
|
-
# Checks to see if the extensions are supported, raises an error if not.
|
|
397
433
|
invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
|
|
398
434
|
if invalid_extensions:
|
|
399
435
|
raise ValueError(
|
|
400
436
|
f"Invalid extensions provided: {', '.join(invalid_extensions)}"
|
|
401
437
|
)
|
|
402
438
|
|
|
403
|
-
# Loop through the path and get all the files with matching extensions
|
|
404
439
|
path_list = self._collect_files(path, extensions)
|
|
405
|
-
|
|
406
440
|
logger.info(
|
|
407
441
|
"Upload directory on %s: Found %d files to upload", path, len(path_list)
|
|
408
442
|
)
|
|
409
443
|
|
|
410
|
-
# Upload all the files using the bulk API to reduce the number
|
|
411
|
-
# of API calls and improve performance
|
|
412
444
|
obj_list = []
|
|
445
|
+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
|
|
413
446
|
params = self._format_upload_parameters("", **kwargs)
|
|
447
|
+
|
|
414
448
|
for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
|
|
415
|
-
# Grouper will put None's on the end of the last group
|
|
416
449
|
file_paths = [p for p in file_paths if p is not None]
|
|
417
|
-
|
|
418
450
|
logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
|
|
419
451
|
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
json=[
|
|
426
|
-
merge_dicts(
|
|
427
|
-
params,
|
|
428
|
-
{
|
|
429
|
-
"title": self._get_title(p),
|
|
430
|
-
"original_extension": os.path.splitext(
|
|
431
|
-
os.path.basename(p)
|
|
432
|
-
)[1]
|
|
433
|
-
.lower()
|
|
434
|
-
.lstrip("."),
|
|
435
|
-
},
|
|
436
|
-
)
|
|
437
|
-
for p in file_paths
|
|
438
|
-
],
|
|
439
|
-
)
|
|
440
|
-
except (APIError, RequestException) as exc:
|
|
441
|
-
if handle_errors:
|
|
442
|
-
logger.info(
|
|
443
|
-
"Error creating the following documents: %s\n%s",
|
|
444
|
-
exc,
|
|
445
|
-
"\n".join(file_paths),
|
|
446
|
-
)
|
|
447
|
-
continue
|
|
448
|
-
else:
|
|
449
|
-
raise
|
|
452
|
+
create_json = self._create_documents(file_paths, params, handle_errors)
|
|
453
|
+
sorted_create_json = sorted(create_json, key=lambda j: j["title"])
|
|
454
|
+
sorted_file_paths = sorted(file_paths, key=self._get_title)
|
|
455
|
+
obj_list.extend(sorted_create_json)
|
|
456
|
+
presigned_urls = [j["presigned_url"] for j in sorted_create_json]
|
|
450
457
|
|
|
451
|
-
|
|
452
|
-
create_json
|
|
453
|
-
obj_list.extend(create_json)
|
|
454
|
-
presigned_urls = [j["presigned_url"] for j in create_json]
|
|
455
|
-
for url, file_path in zip(presigned_urls, file_paths):
|
|
456
|
-
logger.info("Uploading %s to S3...", file_path)
|
|
457
|
-
try:
|
|
458
|
-
with open(file_path, "rb") as file:
|
|
459
|
-
response = requests_retry_session().put(url, data=file.read())
|
|
460
|
-
self.client.raise_for_status(response)
|
|
461
|
-
except (APIError, RequestException) as exc:
|
|
462
|
-
if handle_errors:
|
|
463
|
-
logger.info(
|
|
464
|
-
"Error uploading the following document: %s %s",
|
|
465
|
-
exc,
|
|
466
|
-
file_path,
|
|
467
|
-
)
|
|
468
|
-
continue
|
|
469
|
-
else:
|
|
470
|
-
raise
|
|
471
|
-
|
|
472
|
-
# Begin processing the documents
|
|
473
|
-
logger.info("Processing the documents...")
|
|
474
|
-
doc_ids = [j["id"] for j in create_json]
|
|
475
|
-
try:
|
|
476
|
-
response = self.client.post("documents/process/", json={"ids": doc_ids})
|
|
477
|
-
except (APIError, RequestException) as exc:
|
|
478
|
-
if handle_errors:
|
|
479
|
-
logger.info(
|
|
480
|
-
"Error creating the following documents: %s\n%s",
|
|
481
|
-
exc,
|
|
482
|
-
"\n".join(file_paths),
|
|
483
|
-
)
|
|
484
|
-
continue
|
|
485
|
-
else:
|
|
486
|
-
raise
|
|
458
|
+
self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
|
|
459
|
+
self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
|
|
487
460
|
|
|
488
461
|
logger.info("Upload directory complete")
|
|
489
|
-
|
|
490
|
-
# Pass back the list of documents
|
|
491
462
|
return [Document(self.client, d) for d in obj_list]
|
|
492
463
|
|
|
493
|
-
def
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
"documents
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
params,
|
|
515
|
-
{
|
|
516
|
-
"title": self._get_title(url),
|
|
517
|
-
"file_url": url,
|
|
518
|
-
},
|
|
519
|
-
)
|
|
520
|
-
for url in url_group
|
|
521
|
-
],
|
|
464
|
+
def _create_documents(self, file_paths, params, handle_errors):
|
|
465
|
+
body = [
|
|
466
|
+
merge_dicts(
|
|
467
|
+
params,
|
|
468
|
+
{
|
|
469
|
+
"title": self._get_title(p),
|
|
470
|
+
"original_extension": os.path.splitext(os.path.basename(p))[1]
|
|
471
|
+
.lower()
|
|
472
|
+
.lstrip("."),
|
|
473
|
+
},
|
|
474
|
+
)
|
|
475
|
+
for p in sorted(file_paths)
|
|
476
|
+
]
|
|
477
|
+
try:
|
|
478
|
+
response = self.client.post("documents/", json=body)
|
|
479
|
+
except (APIError, RequestException) as exc:
|
|
480
|
+
if handle_errors:
|
|
481
|
+
logger.info(
|
|
482
|
+
"Error creating the following documents: %s\n%s",
|
|
483
|
+
exc,
|
|
484
|
+
"\n".join(file_paths),
|
|
522
485
|
)
|
|
486
|
+
return []
|
|
487
|
+
else:
|
|
488
|
+
raise
|
|
489
|
+
return response.json()
|
|
490
|
+
|
|
491
|
+
def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
|
|
492
|
+
for url, file_path in zip(presigned_urls, file_paths):
|
|
493
|
+
logger.info("Uploading %s to S3...", file_path)
|
|
494
|
+
try:
|
|
495
|
+
with open(file_path, "rb") as f:
|
|
496
|
+
response = requests_retry_session().put(url, data=f.read())
|
|
497
|
+
self.client.raise_for_status(response)
|
|
523
498
|
except (APIError, RequestException) as exc:
|
|
524
499
|
if handle_errors:
|
|
525
500
|
logger.info(
|
|
526
|
-
"Error
|
|
527
|
-
str(exc),
|
|
528
|
-
"\n".join(url_group),
|
|
501
|
+
"Error uploading the following document: %s %s", exc, file_path
|
|
529
502
|
)
|
|
530
|
-
continue
|
|
531
503
|
else:
|
|
532
504
|
raise
|
|
533
505
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
506
|
+
def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
|
|
507
|
+
payload = [
|
|
508
|
+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
|
|
509
|
+
for j in create_json
|
|
510
|
+
]
|
|
511
|
+
try:
|
|
512
|
+
self.client.post("documents/process/", json=payload)
|
|
513
|
+
except (APIError, RequestException) as exc:
|
|
514
|
+
if handle_errors:
|
|
515
|
+
logger.info("Error processing documents: %s", exc)
|
|
516
|
+
else:
|
|
517
|
+
raise
|
|
541
518
|
|
|
542
519
|
|
|
543
520
|
class Mention:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: python-documentcloud
|
|
3
|
-
Version: 4.
|
|
3
|
+
Version: 4.5.0
|
|
4
4
|
Summary: A simple Python wrapper for the DocumentCloud API
|
|
5
5
|
Home-page: https://github.com/muckrock/python-documentcloud
|
|
6
6
|
Author: Mitchell Kotler
|
|
@@ -46,6 +46,7 @@ Dynamic: description
|
|
|
46
46
|
Dynamic: description-content-type
|
|
47
47
|
Dynamic: home-page
|
|
48
48
|
Dynamic: license
|
|
49
|
+
Dynamic: license-file
|
|
49
50
|
Dynamic: provides-extra
|
|
50
51
|
Dynamic: requires-dist
|
|
51
52
|
Dynamic: summary
|
|
@@ -4,15 +4,15 @@ documentcloud/annotations.py,sha256=wVe3wYzyTRvc_hJ3r0m6iyDf6WIFlaGcCnyah_r53pg,
|
|
|
4
4
|
documentcloud/base.py,sha256=pNF45aleYpQ9fj75CiL3c4Ssv6MO1EmdzZ6wBLPKHDg,6545
|
|
5
5
|
documentcloud/client.py,sha256=WXHNE1BT-LE2E55XlOvPuWl_g5N0zUIdXvB7Qj_fMNc,1658
|
|
6
6
|
documentcloud/constants.py,sha256=h6NStSkxPrjQ2gzaIlqftCF7tthkRimddOE8SsmlHag,1828
|
|
7
|
-
documentcloud/documents.py,sha256=
|
|
7
|
+
documentcloud/documents.py,sha256=dgoUr2XsxYmxC1xv3lJHgFQdJyE_rBNa2QS0Mn5Y2Is,18294
|
|
8
8
|
documentcloud/exceptions.py,sha256=AwIJpcylq6sF6oaenrZE6nr2EBuj23nxTOf3z_RwtuE,461
|
|
9
9
|
documentcloud/organizations.py,sha256=_Ot6MWzoa5JdU3jqedU-0Fec_K8WrgxqdlIp4oIijes,392
|
|
10
10
|
documentcloud/projects.py,sha256=KuOiw65a-8fdgbjo7BqjbEbWguds8inkhFJZJd578bs,5328
|
|
11
11
|
documentcloud/sections.py,sha256=cMf973KMvp6fAPSMXCD67L32Pz1_Tfh81oV2q2UQ9Uk,924
|
|
12
12
|
documentcloud/toolbox.py,sha256=zFZTyOn40YZjBpqa1H3qjpR4C3Wu1X2g72AvH_ljlic,1835
|
|
13
13
|
documentcloud/users.py,sha256=yydOXoEsfJlYqryZpXQ4G3aeRc5y_QCHqXd0dfF1aIc,354
|
|
14
|
-
python_documentcloud-4.
|
|
15
|
-
python_documentcloud-4.
|
|
16
|
-
python_documentcloud-4.
|
|
17
|
-
python_documentcloud-4.
|
|
18
|
-
python_documentcloud-4.
|
|
14
|
+
python_documentcloud-4.5.0.dist-info/licenses/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
|
|
15
|
+
python_documentcloud-4.5.0.dist-info/METADATA,sha256=90GM8QOJIaQfjjZ_KKyxqvkQr1rxKg6IpCnAI_FZS1I,2880
|
|
16
|
+
python_documentcloud-4.5.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
|
|
17
|
+
python_documentcloud-4.5.0.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
|
|
18
|
+
python_documentcloud-4.5.0.dist-info/RECORD,,
|
{python_documentcloud-4.4.0.dist-info → python_documentcloud-4.5.0.dist-info/licenses}/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|