python-documentcloud 4.4.1__tar.gz → 4.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/PKG-INFO +1 -1
  2. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/client.py +4 -8
  3. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/documents.py +111 -140
  4. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/python_documentcloud.egg-info/PKG-INFO +1 -1
  5. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/setup.py +1 -1
  6. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_documents.py +1 -4
  7. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/LICENSE +0 -0
  8. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/README.md +0 -0
  9. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/__init__.py +0 -0
  10. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/addon.py +0 -0
  11. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/annotations.py +0 -0
  12. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/base.py +0 -0
  13. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/constants.py +0 -0
  14. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/exceptions.py +0 -0
  15. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/organizations.py +0 -0
  16. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/projects.py +0 -0
  17. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/sections.py +0 -0
  18. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/toolbox.py +0 -0
  19. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/documentcloud/users.py +0 -0
  20. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/python_documentcloud.egg-info/SOURCES.txt +0 -0
  21. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/python_documentcloud.egg-info/dependency_links.txt +0 -0
  22. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/python_documentcloud.egg-info/requires.txt +0 -0
  23. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/python_documentcloud.egg-info/top_level.txt +0 -0
  24. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/setup.cfg +0 -0
  25. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_annotations.py +0 -0
  26. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_base.py +0 -0
  27. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_client.py +0 -0
  28. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_organizations.py +0 -0
  29. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_projects.py +0 -0
  30. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_sections.py +0 -0
  31. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_toolbox.py +0 -0
  32. {python_documentcloud-4.4.1 → python_documentcloud-4.5.0}/tests/test_users.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-documentcloud
3
- Version: 4.4.1
3
+ Version: 4.5.0
4
4
  Summary: A simple Python wrapper for the DocumentCloud API
5
5
  Home-page: https://github.com/muckrock/python-documentcloud
6
6
  Author: Mitchell Kotler
@@ -14,11 +14,12 @@ from .users import UserClient
14
14
 
15
15
  logger = logging.getLogger("documentcloud")
16
16
 
17
+
17
18
  class DocumentCloud(SquareletClient):
18
19
  """
19
20
  The public interface for the DocumentCloud API, now integrated with SquareletClient
20
21
  """
21
- # pylint:disable=too-many-positional-arguments
22
+
22
23
  def __init__(
23
24
  self,
24
25
  username=None,
@@ -30,7 +31,7 @@ class DocumentCloud(SquareletClient):
30
31
  rate_limit=True,
31
32
  rate_limit_sleep=True,
32
33
  ):
33
- # Initialize SquareletClient for authentication and request handling
34
+ # Initialize SquareletClient for authentication and request handling
34
35
  super().__init__(
35
36
  base_uri=base_uri,
36
37
  username=username,
@@ -38,7 +39,7 @@ class DocumentCloud(SquareletClient):
38
39
  auth_uri=auth_uri,
39
40
  timeout=timeout,
40
41
  rate_limit=rate_limit,
41
- rate_limit_sleep=rate_limit_sleep
42
+ rate_limit_sleep=rate_limit_sleep,
42
43
  )
43
44
 
44
45
  # Set up logging
@@ -55,8 +56,3 @@ class DocumentCloud(SquareletClient):
55
56
  self.projects = ProjectClient(self)
56
57
  self.users = UserClient(self)
57
58
  self.organizations = OrganizationClient(self)
58
-
59
- """def _request(self, method, url, raise_error=True, **kwargs):
60
- Delegates request to the SquareletClient's _request method
61
- return self.squarelet_client.request(method, url, raise_error, **kwargs)
62
- """
@@ -74,8 +74,11 @@ class Document(BaseAPIObject):
74
74
  def __getattr__(self, attr):
75
75
  """Generate methods for fetching resources"""
76
76
  p_image = re.compile(
77
- r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
77
+ r"^get_"
78
+ r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
79
+ r"(?P<list>_list)?$"
78
80
  )
81
+
79
82
  get = attr.startswith("get_")
80
83
  url = attr.endswith("_url")
81
84
  text = attr.endswith("_text")
@@ -230,9 +233,15 @@ class Document(BaseAPIObject):
230
233
 
231
234
  return all_results
232
235
 
233
- def process(self):
234
- """Reprocess the document"""
235
- self._client.post(f"{self.api_path}/{self.id}/process/")
236
+ def process(self, **kwargs):
237
+ """Process the document, used on upload and for reprocessing"""
238
+ payload = {}
239
+ if "force_ocr" in kwargs:
240
+ payload["force_ocr"] = kwargs["force_ocr"]
241
+ if "ocr_engine" in kwargs:
242
+ payload["ocr_engine"] = kwargs["ocr_engine"]
243
+
244
+ self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
236
245
 
237
246
 
238
247
  class DocumentClient(BaseAPIClient):
@@ -310,6 +319,7 @@ class DocumentClient(BaseAPIClient):
310
319
  "title",
311
320
  "data",
312
321
  "force_ocr",
322
+ "ocr_engine",
313
323
  "projects",
314
324
  "delayed_index",
315
325
  "revision_control",
@@ -333,21 +343,55 @@ class DocumentClient(BaseAPIClient):
333
343
 
334
344
  return params
335
345
 
346
+ def _extract_ocr_options(self, kwargs):
347
+ """
348
+ Extract and validate OCR options from kwargs.
349
+
350
+ Returns:
351
+ force_ocr (bool)
352
+ ocr_engine (str)
353
+ """
354
+ force_ocr = kwargs.pop("force_ocr", False)
355
+ ocr_engine = kwargs.pop("ocr_engine", "tess4")
356
+
357
+ if not isinstance(force_ocr, bool):
358
+ raise ValueError("force_ocr must be a boolean")
359
+
360
+ if ocr_engine and ocr_engine not in ("tess4", "textract"):
361
+ raise ValueError(
362
+ "ocr_engine must be either 'tess4' for tesseract or 'textract'"
363
+ )
364
+
365
+ return force_ocr, ocr_engine
366
+
336
367
  def _get_title(self, name):
337
368
  """Get the default title for a document from its path"""
338
369
  return name.split(os.sep)[-1].rsplit(".", 1)[0]
339
370
 
340
371
  def _upload_url(self, file_url, **kwargs):
341
372
  """Upload a document from a publicly accessible URL"""
373
+ # extract process-related args
374
+ force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
375
+
376
+ # create the document
342
377
  params = self._format_upload_parameters(file_url, **kwargs)
343
378
  params["file_url"] = file_url
379
+ if force_ocr:
380
+ params["force_ocr"] = force_ocr
381
+ params["ocr_engine"] = ocr_engine
344
382
  response = self.client.post("documents/", json=params)
345
- return Document(self.client, response.json())
383
+ create_json = response.json()
384
+
385
+ # wrap in Document object
386
+ doc = Document(self.client, create_json)
387
+
388
+ return doc
346
389
 
347
390
  def _upload_file(self, file_, **kwargs):
348
391
  """Upload a document directly"""
349
392
  # create the document
350
- force_ocr = kwargs.pop("force_ocr", False)
393
+ force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
394
+
351
395
  params = self._format_upload_parameters(file_.name, **kwargs)
352
396
  response = self.client.post("documents/", json=params)
353
397
 
@@ -357,12 +401,12 @@ class DocumentClient(BaseAPIClient):
357
401
  response = requests_retry_session().put(presigned_url, data=file_.read())
358
402
 
359
403
  # begin processing the document
360
- doc_id = create_json["id"]
361
- response = self.client.post(
362
- f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
363
- )
404
+ doc = Document(self.client, create_json)
364
405
 
365
- return Document(self.client, create_json)
406
+ # begin processing
407
+ doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
408
+
409
+ return doc
366
410
 
367
411
  def _collect_files(self, path, extensions):
368
412
  """Find the paths to files with specified extensions under a directory"""
@@ -379,171 +423,98 @@ class DocumentClient(BaseAPIClient):
379
423
 
380
424
  def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
381
425
  """Upload files with specified extensions in a directory"""
382
- # pylint: disable=too-many-locals, too-many-branches
383
-
384
- # Do not set the same title for all documents
426
+ # pylint:disable=too-many-locals
385
427
  kwargs.pop("title", None)
386
428
 
387
- # If extensions are specified as None, it will check for all supported
388
- # filetypes.
389
429
  if extensions is None:
390
430
  extensions = SUPPORTED_EXTENSIONS
391
-
392
- # Convert single extension to a list if provided
393
431
  if extensions and not isinstance(extensions, list):
394
432
  extensions = [extensions]
395
-
396
- # Checks to see if the extensions are supported, raises an error if not.
397
433
  invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
398
434
  if invalid_extensions:
399
435
  raise ValueError(
400
436
  f"Invalid extensions provided: {', '.join(invalid_extensions)}"
401
437
  )
402
438
 
403
- # Loop through the path and get all the files with matching extensions
404
439
  path_list = self._collect_files(path, extensions)
405
-
406
440
  logger.info(
407
- "Upload directory on %s: Found %d files to upload",
408
- path,
409
- len(path_list)
441
+ "Upload directory on %s: Found %d files to upload", path, len(path_list)
410
442
  )
411
443
 
412
- # Upload all the files using the bulk API to reduce the number
413
- # of API calls and improve performance
414
444
  obj_list = []
445
+ force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
415
446
  params = self._format_upload_parameters("", **kwargs)
447
+
416
448
  for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
417
- # Grouper will put None's on the end of the last group
418
449
  file_paths = [p for p in file_paths if p is not None]
419
-
420
450
  logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
421
451
 
422
- # Create the documents
423
- logger.info("Creating the documents...")
424
- try:
425
- response = self.client.post(
426
- "documents/",
427
- json=[
428
- merge_dicts(
429
- params,
430
- {
431
- "title": self._get_title(p),
432
- "original_extension": os.path.splitext(
433
- os.path.basename(p)
434
- )[1]
435
- .lower()
436
- .lstrip("."),
437
- },
438
- )
439
- for p in file_paths
440
- ],
441
- )
442
- except (APIError, RequestException) as exc:
443
- if handle_errors:
444
- logger.info(
445
- "Error creating the following documents: %s\n%s",
446
- exc,
447
- "\n".join(file_paths)
448
- )
449
- continue
450
- else:
451
- raise
452
+ create_json = self._create_documents(file_paths, params, handle_errors)
453
+ sorted_create_json = sorted(create_json, key=lambda j: j["title"])
454
+ sorted_file_paths = sorted(file_paths, key=self._get_title)
455
+ obj_list.extend(sorted_create_json)
456
+ presigned_urls = [j["presigned_url"] for j in sorted_create_json]
452
457
 
453
- # Upload the files directly to storage
454
- create_json = response.json()
455
- obj_list.extend(create_json)
456
- presigned_urls = [j["presigned_url"] for j in create_json]
457
- for url, file_path in zip(presigned_urls, file_paths):
458
- logger.info("Uploading %s to S3...", file_path)
459
- try:
460
- with open(file_path, "rb") as file:
461
- response = requests_retry_session().put(url, data=file.read())
462
- self.client.raise_for_status(response)
463
- except (APIError, RequestException) as exc:
464
- if handle_errors:
465
- logger.info(
466
- "Error uploading the following document: %s %s",
467
- exc,
468
- file_path
469
- )
470
- continue
471
- else:
472
- raise
473
-
474
- # Begin processing the documents
475
- logger.info("Processing the documents...")
476
- doc_ids = [j["id"] for j in create_json]
477
- try:
478
- response = self.client.post("documents/process/", json={"ids": doc_ids})
479
- except (APIError, RequestException) as exc:
480
- if handle_errors:
481
- logger.info(
482
- "Error creating the following documents: %s\n%s",
483
- exc,
484
- "\n".join(file_paths)
485
- )
486
- continue
487
- else:
488
- raise
458
+ self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
459
+ self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
489
460
 
490
461
  logger.info("Upload directory complete")
491
-
492
- # Pass back the list of documents
493
462
  return [Document(self.client, d) for d in obj_list]
494
463
 
495
- def upload_urls(self, url_list, handle_errors=False, **kwargs):
496
- """Upload documents from a list of URLs"""
497
-
498
- # Do not set the same title for all documents
499
- kwargs.pop("title", None)
500
-
501
- obj_list = []
502
- params = self._format_upload_parameters("", **kwargs)
503
- for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
504
- # Grouper will put None's on the end of the last group
505
- url_group = [url for url in url_group if url is not None]
506
-
507
- logger.info(
508
- "Uploading group %d: %s",
509
- i + 1,
510
- "\n".join(url_group)
464
+ def _create_documents(self, file_paths, params, handle_errors):
465
+ body = [
466
+ merge_dicts(
467
+ params,
468
+ {
469
+ "title": self._get_title(p),
470
+ "original_extension": os.path.splitext(os.path.basename(p))[1]
471
+ .lower()
472
+ .lstrip("."),
473
+ },
511
474
  )
512
-
513
- # Create the documents
514
- logger.info("Creating the documents...")
515
- try:
516
- response = self.client.post(
517
- "documents/",
518
- json=[
519
- merge_dicts(
520
- params,
521
- {
522
- "title": self._get_title(url),
523
- "file_url": url,
524
- },
525
- )
526
- for url in url_group
527
- ],
475
+ for p in sorted(file_paths)
476
+ ]
477
+ try:
478
+ response = self.client.post("documents/", json=body)
479
+ except (APIError, RequestException) as exc:
480
+ if handle_errors:
481
+ logger.info(
482
+ "Error creating the following documents: %s\n%s",
483
+ exc,
484
+ "\n".join(file_paths),
528
485
  )
486
+ return []
487
+ else:
488
+ raise
489
+ return response.json()
490
+
491
+ def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
492
+ for url, file_path in zip(presigned_urls, file_paths):
493
+ logger.info("Uploading %s to S3...", file_path)
494
+ try:
495
+ with open(file_path, "rb") as f:
496
+ response = requests_retry_session().put(url, data=f.read())
497
+ self.client.raise_for_status(response)
529
498
  except (APIError, RequestException) as exc:
530
499
  if handle_errors:
531
500
  logger.info(
532
- "Error creating the following documents: %s\n%s",
533
- str(exc),
534
- "\n".join(url_group)
501
+ "Error uploading the following document: %s %s", exc, file_path
535
502
  )
536
- continue
537
503
  else:
538
504
  raise
539
505
 
540
- create_json = response.json()
541
- obj_list.extend(create_json)
542
-
543
- logger.info("Upload URLs complete")
544
-
545
- # Pass back the list of documents
546
- return [Document(self.client, d) for d in obj_list]
506
+ def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
507
+ payload = [
508
+ {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
509
+ for j in create_json
510
+ ]
511
+ try:
512
+ self.client.post("documents/process/", json=payload)
513
+ except (APIError, RequestException) as exc:
514
+ if handle_errors:
515
+ logger.info("Error processing documents: %s", exc)
516
+ else:
517
+ raise
547
518
 
548
519
 
549
520
  class Mention:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-documentcloud
3
- Version: 4.4.1
3
+ Version: 4.5.0
4
4
  Summary: A simple Python wrapper for the DocumentCloud API
5
5
  Home-page: https://github.com/muckrock/python-documentcloud
6
6
  Author: Mitchell Kotler
@@ -7,7 +7,7 @@ with open("README.md", "r") as fh:
7
7
 
8
8
  setup(
9
9
  name="python-documentcloud",
10
- version="4.4.1",
10
+ version="4.5.0",
11
11
  description="A simple Python wrapper for the DocumentCloud API",
12
12
  author="Mitchell Kotler",
13
13
  author_email="mitch@muckrock.com",
@@ -158,9 +158,7 @@ class TestDocument:
158
158
 
159
159
  class TestDocumentClient:
160
160
  def test_search(self, client, document):
161
- documents = client.documents.search(
162
- f"document:{document.id} simple"
163
- )
161
+ documents = client.documents.search(f"document:{document.id} simple")
164
162
  assert documents
165
163
 
166
164
  def test_list(self, client):
@@ -182,7 +180,6 @@ class TestDocumentClient:
182
180
  document = document_factory(pdf)
183
181
  assert document.status == "success"
184
182
 
185
-
186
183
  def test_upload_file_path(self, document_factory):
187
184
  document = document_factory("tests/test.pdf")
188
185
  assert document.status == "success"