python-documentcloud 4.5.0__py2.py3-none-any.whl → 4.7.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
documentcloud/addon.py CHANGED
@@ -182,6 +182,26 @@ class AddOn(BaseAddOn):
182
182
  f"addon_runs/{self.id}/", json={"file_name": file_name}
183
183
  )
184
184
 
185
+ def load_run_data(self):
186
+ "Load persistent data from this run"
187
+ if not self.id:
188
+ return {}
189
+
190
+ response = self.client.get(f"addon_runs/{self.id}/")
191
+ response.raise_for_status()
192
+ return response.json().get("data", {})
193
+
194
+ def store_run_data(self, data):
195
+ "Store persistent data for this run"
196
+ if not self.id:
197
+ print("Run ID not set. Try again later or check if something went wrong.")
198
+ return None
199
+
200
+ if not isinstance(data, dict):
201
+ raise TypeError("Invalid data")
202
+
203
+ return self.client.patch(f"addon_runs/{self.id}/", json={"data": data})
204
+
185
205
  def load_event_data(self):
186
206
  """Load persistent data for this event"""
187
207
  if not self.event_id:
documentcloud/client.py CHANGED
@@ -1,12 +1,12 @@
1
- # Import SquareletClient from python-squarelet
2
1
  # Standard Library
3
2
  import logging
3
+ import time
4
4
 
5
5
  # Third Party
6
+ import token_bucket
6
7
  from squarelet import SquareletClient
7
8
 
8
9
  # Local
9
- # Local Imports
10
10
  from .documents import DocumentClient
11
11
  from .organizations import OrganizationClient
12
12
  from .projects import ProjectClient
@@ -14,6 +14,22 @@ from .users import UserClient
14
14
 
15
15
  logger = logging.getLogger("documentcloud")
16
16
 
17
+ # Per-endpoint rate limits applied on top of the global squarelet limit.
18
+ # Format: (method, url_pattern, rate_per_second, capacity)
19
+ #
20
+ # Endpoint Rate Burst Notes
21
+ # -------- ---- ----- -----
22
+ # GET documents/search 15/min 50
23
+ # POST documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min
24
+ # PUT documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min
25
+ # GET files/ 15/min 100 PDFs, full text, and other private assets
26
+ ENDPOINT_RATE_LIMITS = [
27
+ ("GET", "documents/search", 15 / 60, 50),
28
+ ("POST", "documents/", 12 / 60, 100),
29
+ ("PUT", "documents/", 12 / 60, 100),
30
+ ("GET", "files/", 15 / 60, 100),
31
+ ]
32
+
17
33
 
18
34
  class DocumentCloud(SquareletClient):
19
35
  """
@@ -51,8 +67,34 @@ class DocumentCloud(SquareletClient):
51
67
  else:
52
68
  logger.addHandler(logging.NullHandler())
53
69
 
70
+ # Build per-endpoint token bucket rate limiters
71
+ storage = token_bucket.MemoryStorage()
72
+ self._endpoint_limiters = [
73
+ (
74
+ pattern_method,
75
+ pattern,
76
+ token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage),
77
+ f"{pattern_method}:{pattern}",
78
+ )
79
+ for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS
80
+ ]
81
+
54
82
  # Initialize the sub-clients using SquareletClient
55
83
  self.documents = DocumentClient(self)
56
84
  self.projects = ProjectClient(self)
57
85
  self.users = UserClient(self)
58
86
  self.organizations = OrganizationClient(self)
87
+
88
+ def request(self, method, url, raise_error=True, **kwargs):
89
+ for pattern_method, pattern, limiter, bucket_key in self._endpoint_limiters:
90
+ if pattern_method.upper() == method.upper() and pattern in url:
91
+ if not limiter.consume(bucket_key):
92
+ logger.warning(
93
+ "Rate limit reached for %s %s, throttling...",
94
+ method.upper(),
95
+ pattern,
96
+ )
97
+ while not limiter.consume(bucket_key):
98
+ time.sleep(0.1)
99
+ return super().request(method, url, raise_error=raise_error, **kwargs)
100
+ return super().request(method, url, raise_error=raise_error, **kwargs)
documentcloud/documents.py CHANGED
@@ -7,10 +7,13 @@ import datetime
7
7
  import logging
8
8
  import os
9
9
  import re
10
+ import time
10
11
  import warnings
11
12
  from functools import partial
13
+ from urllib.parse import urlparse
12
14
 
13
15
  # Third Party
16
+ import token_bucket
14
17
  from requests.exceptions import RequestException
15
18
 
16
19
  # Local
@@ -23,15 +26,12 @@ from .sections import SectionClient
23
26
  from .toolbox import grouper, is_url, merge_dicts, requests_retry_session
24
27
  from .users import User
25
28
 
26
- try:
27
- from urllib.parse import urlparse
28
- except ImportError:
29
- from urlparse import urlparse
30
-
31
29
  logger = logging.getLogger("documentcloud")
32
30
 
33
31
  IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
34
32
 
33
+ DEFAULT_USER_AGENT = "python-documentcloud"
34
+
35
35
 
36
36
  class Document(BaseAPIObject):
37
37
  """A single DocumentCloud document"""
@@ -168,12 +168,17 @@ class Document(BaseAPIObject):
168
168
 
169
169
  if base_netloc == url_netloc:
170
170
  # if the url host is the same as the base api host,
171
- # sent the request with the client in order to include
171
+ # send the request with the client in order to include
172
172
  # authentication credentials
173
173
  response = self._client.get(url, full_url=True)
174
174
  else:
175
- response = requests_retry_session().get(
176
- url, headers={"User-Agent": "python-documentcloud2"}
175
+ response = self._client.documents.asset_get(
176
+ url,
177
+ headers={
178
+ "User-Agent": self._client.session.headers.get(
179
+ "User-Agent", DEFAULT_USER_AGENT
180
+ )
181
+ },
177
182
  )
178
183
  if fmt == "text":
179
184
  return response.content.decode("utf8")
@@ -250,6 +255,26 @@ class DocumentClient(BaseAPIClient):
250
255
  api_path = "documents"
251
256
  resource = Document
252
257
 
258
+ def __init__(self, client):
259
+ super().__init__(client)
260
+ # Rate limit for public document asset fetches (S3-hosted).
261
+ # Private document assets go through the API client and are limited there.
262
+ # Token bucket: burst of 100, sustained at 15/min (0.25/sec).
263
+ storage = token_bucket.MemoryStorage()
264
+ self._asset_limiter = token_bucket.Limiter(
265
+ rate=15 / 60,
266
+ capacity=100,
267
+ storage=storage,
268
+ )
269
+ self._asset_session = requests_retry_session()
270
+
271
+ def asset_get(self, url, **kwargs):
272
+ if not self._asset_limiter.consume("asset"):
273
+ logger.warning("Rate limit reached for asset fetch, throttling...")
274
+ while not self._asset_limiter.consume("asset"):
275
+ time.sleep(0.1)
276
+ return self._asset_session.get(url, **kwargs)
277
+
253
278
  def search(self, query, **params):
254
279
  """Return documents matching a search query"""
255
280
 
documentcloud/exceptions.py CHANGED
@@ -2,11 +2,14 @@
2
2
  Custom exceptions for python-documentcloud
3
3
  """
4
4
 
5
+ # Third Party
5
6
  # pylint: disable=unused-import
6
7
  # Import exceptions from python-squarelet
7
- from squarelet.exceptions import SquareletError as DocumentCloudError
8
- from squarelet.exceptions import DuplicateObjectError
9
- from squarelet.exceptions import CredentialsFailedError
10
- from squarelet.exceptions import APIError
11
- from squarelet.exceptions import DoesNotExistError
12
- from squarelet.exceptions import MultipleObjectsReturnedError
8
+ from squarelet.exceptions import (
9
+ APIError,
10
+ CredentialsFailedError,
11
+ DoesNotExistError,
12
+ DuplicateObjectError,
13
+ MultipleObjectsReturnedError,
14
+ SquareletError as DocumentCloudError,
15
+ )
python_documentcloud-4.7.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-documentcloud
3
- Version: 4.5.0
3
+ Version: 4.7.0
4
4
  Summary: A simple Python wrapper for the DocumentCloud API
5
5
  Home-page: https://github.com/muckrock/python-documentcloud
6
6
  Author: Mitchell Kotler
@@ -27,6 +27,7 @@ Requires-Dist: urllib3
27
27
  Requires-Dist: pyyaml
28
28
  Requires-Dist: fastjsonschema
29
29
  Requires-Dist: python-squarelet
30
+ Requires-Dist: token-bucket
30
31
  Provides-Extra: dev
31
32
  Requires-Dist: black; extra == "dev"
32
33
  Requires-Dist: coverage; extra == "dev"
@@ -37,6 +38,7 @@ Requires-Dist: twine; extra == "dev"
37
38
  Provides-Extra: test
38
39
  Requires-Dist: pytest; extra == "test"
39
40
  Requires-Dist: pytest-mock; extra == "test"
41
+ Requires-Dist: pytest-xdist; extra == "test"
40
42
  Requires-Dist: pytest-recording; extra == "test"
41
43
  Requires-Dist: vcrpy; extra == "test"
42
44
  Dynamic: author
python_documentcloud-4.7.0.dist-info/RECORD CHANGED
@@ -1,18 +1,18 @@
1
1
  documentcloud/__init__.py,sha256=XAwOR6JYL-flQV_uC616AMA2rYiXTkeogNolqE6LzN4,220
2
- documentcloud/addon.py,sha256=3FxQjm26jknjLdd-GuztiZO4Z7NcgXq4WqunE9oh2es,11754
2
+ documentcloud/addon.py,sha256=BvELxbc5pm7vYxo8bWY1VLLcn-VJAQQwmy8Y-G7n26c,12402
3
3
  documentcloud/annotations.py,sha256=wVe3wYzyTRvc_hJ3r0m6iyDf6WIFlaGcCnyah_r53pg,2538
4
4
  documentcloud/base.py,sha256=pNF45aleYpQ9fj75CiL3c4Ssv6MO1EmdzZ6wBLPKHDg,6545
5
- documentcloud/client.py,sha256=WXHNE1BT-LE2E55XlOvPuWl_g5N0zUIdXvB7Qj_fMNc,1658
5
+ documentcloud/client.py,sha256=aLFncIVEnN9M3DN0SwL-t8jWIY9CR3Tr7AwSzEHHsP8,3542
6
6
  documentcloud/constants.py,sha256=h6NStSkxPrjQ2gzaIlqftCF7tthkRimddOE8SsmlHag,1828
7
- documentcloud/documents.py,sha256=dgoUr2XsxYmxC1xv3lJHgFQdJyE_rBNa2QS0Mn5Y2Is,18294
8
- documentcloud/exceptions.py,sha256=AwIJpcylq6sF6oaenrZE6nr2EBuj23nxTOf3z_RwtuE,461
7
+ documentcloud/documents.py,sha256=sixDwg0cqwv0c45vIL4MKd3uyPt987e9t7GgqjBeh4k,19307
8
+ documentcloud/exceptions.py,sha256=Fq_v7QBcvj-l4yeT7ii_1MrGAPiRs8e1Fwz8qtB4Xqc,344
9
9
  documentcloud/organizations.py,sha256=_Ot6MWzoa5JdU3jqedU-0Fec_K8WrgxqdlIp4oIijes,392
10
10
  documentcloud/projects.py,sha256=KuOiw65a-8fdgbjo7BqjbEbWguds8inkhFJZJd578bs,5328
11
11
  documentcloud/sections.py,sha256=cMf973KMvp6fAPSMXCD67L32Pz1_Tfh81oV2q2UQ9Uk,924
12
12
  documentcloud/toolbox.py,sha256=zFZTyOn40YZjBpqa1H3qjpR4C3Wu1X2g72AvH_ljlic,1835
13
13
  documentcloud/users.py,sha256=yydOXoEsfJlYqryZpXQ4G3aeRc5y_QCHqXd0dfF1aIc,354
14
- python_documentcloud-4.5.0.dist-info/licenses/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
15
- python_documentcloud-4.5.0.dist-info/METADATA,sha256=90GM8QOJIaQfjjZ_KKyxqvkQr1rxKg6IpCnAI_FZS1I,2880
16
- python_documentcloud-4.5.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
17
- python_documentcloud-4.5.0.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
18
- python_documentcloud-4.5.0.dist-info/RECORD,,
14
+ python_documentcloud-4.7.0.dist-info/licenses/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
15
+ python_documentcloud-4.7.0.dist-info/METADATA,sha256=QY7JfYCSWmqRz9amTLlZFedeGSw5OZuLqsTaOBcxEgs,2953
16
+ python_documentcloud-4.7.0.dist-info/WHEEL,sha256=TdQ5LtNwLuxTCjgxN51AgdU5w-KkB9ttmLbzjTH02pg,109
17
+ python_documentcloud-4.7.0.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
18
+ python_documentcloud-4.7.0.dist-info/RECORD,,
python_documentcloud-4.7.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (82.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py2-none-any
5
5
  Tag: py3-none-any