python-documentcloud 4.6.0__tar.gz → 4.7.0__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (34)
  1. {python_documentcloud-4.6.0/python_documentcloud.egg-info → python_documentcloud-4.7.0}/PKG-INFO +3 -1
  2. python_documentcloud-4.7.0/documentcloud/client.py +100 -0
  3. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/documents.py +32 -3
  4. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0/python_documentcloud.egg-info}/PKG-INFO +3 -1
  5. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/python_documentcloud.egg-info/requires.txt +2 -0
  6. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/setup.py +3 -1
  7. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_client.py +75 -0
  8. python_documentcloud-4.6.0/documentcloud/client.py +0 -58
  9. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/LICENSE +0 -0
  10. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/README.md +0 -0
  11. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/__init__.py +0 -0
  12. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/addon.py +0 -0
  13. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/annotations.py +0 -0
  14. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/base.py +0 -0
  15. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/constants.py +0 -0
  16. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/exceptions.py +0 -0
  17. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/organizations.py +0 -0
  18. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/projects.py +0 -0
  19. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/sections.py +0 -0
  20. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/toolbox.py +0 -0
  21. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/documentcloud/users.py +0 -0
  22. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/python_documentcloud.egg-info/SOURCES.txt +0 -0
  23. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/python_documentcloud.egg-info/dependency_links.txt +0 -0
  24. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/python_documentcloud.egg-info/top_level.txt +0 -0
  25. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/setup.cfg +0 -0
  26. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_addon.py +0 -0
  27. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_annotations.py +0 -0
  28. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_base.py +0 -0
  29. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_documents.py +0 -0
  30. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_organizations.py +0 -0
  31. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_projects.py +0 -0
  32. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_sections.py +0 -0
  33. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_toolbox.py +0 -0
  34. {python_documentcloud-4.6.0 → python_documentcloud-4.7.0}/tests/test_users.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-documentcloud
3
- Version: 4.6.0
3
+ Version: 4.7.0
4
4
  Summary: A simple Python wrapper for the DocumentCloud API
5
5
  Home-page: https://github.com/muckrock/python-documentcloud
6
6
  Author: Mitchell Kotler
@@ -27,6 +27,7 @@ Requires-Dist: urllib3
27
27
  Requires-Dist: pyyaml
28
28
  Requires-Dist: fastjsonschema
29
29
  Requires-Dist: python-squarelet
30
+ Requires-Dist: token-bucket
30
31
  Provides-Extra: dev
31
32
  Requires-Dist: black; extra == "dev"
32
33
  Requires-Dist: coverage; extra == "dev"
@@ -37,6 +38,7 @@ Requires-Dist: twine; extra == "dev"
37
38
  Provides-Extra: test
38
39
  Requires-Dist: pytest; extra == "test"
39
40
  Requires-Dist: pytest-mock; extra == "test"
41
+ Requires-Dist: pytest-xdist; extra == "test"
40
42
  Requires-Dist: pytest-recording; extra == "test"
41
43
  Requires-Dist: vcrpy; extra == "test"
42
44
  Dynamic: author
@@ -0,0 +1,100 @@
1
+ # Standard Library
2
+ import logging
3
+ import time
4
+
5
+ # Third Party
6
+ import token_bucket
7
+ from squarelet import SquareletClient
8
+
9
+ # Local
10
+ from .documents import DocumentClient
11
+ from .organizations import OrganizationClient
12
+ from .projects import ProjectClient
13
+ from .users import UserClient
14
+
15
+ logger = logging.getLogger("documentcloud")
16
+
17
+ # Per-endpoint rate limits applied on top of the global squarelet limit.
18
+ # Format: (method, url_pattern, rate_per_second, capacity)
19
+ #
20
+ # Endpoint Rate Burst Notes
21
+ # -------- ---- ----- -----
22
+ # GET documents/search 15/min 50
23
+ # POST documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min
24
+ # PUT documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min
25
+ # GET files/ 15/min 100 PDFs, full text, and other private assets
26
+ ENDPOINT_RATE_LIMITS = [
27
+ ("GET", "documents/search", 15 / 60, 50),
28
+ ("POST", "documents/", 12 / 60, 100),
29
+ ("PUT", "documents/", 12 / 60, 100),
30
+ ("GET", "files/", 15 / 60, 100),
31
+ ]
32
+
33
+
34
+ class DocumentCloud(SquareletClient):
35
+ """
36
+ The public interface for the DocumentCloud API, now integrated with SquareletClient
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ username=None,
42
+ password=None,
43
+ base_uri="https://api.www.documentcloud.org/api/",
44
+ auth_uri="https://accounts.muckrock.com/api/",
45
+ timeout=20,
46
+ loglevel=None,
47
+ rate_limit=True,
48
+ rate_limit_sleep=True,
49
+ ):
50
+ # Initialize SquareletClient for authentication and request handling
51
+ super().__init__(
52
+ base_uri=base_uri,
53
+ username=username,
54
+ password=password,
55
+ auth_uri=auth_uri,
56
+ timeout=timeout,
57
+ rate_limit=rate_limit,
58
+ rate_limit_sleep=rate_limit_sleep,
59
+ )
60
+
61
+ # Set up logging
62
+ if loglevel:
63
+ logging.basicConfig(
64
+ level=loglevel,
65
+ format="%(asctime)s %(levelname)-8s %(name)-25s %(message)s",
66
+ )
67
+ else:
68
+ logger.addHandler(logging.NullHandler())
69
+
70
+ # Build per-endpoint token bucket rate limiters
71
+ storage = token_bucket.MemoryStorage()
72
+ self._endpoint_limiters = [
73
+ (
74
+ pattern_method,
75
+ pattern,
76
+ token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage),
77
+ f"{pattern_method}:{pattern}",
78
+ )
79
+ for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS
80
+ ]
81
+
82
+ # Initialize the sub-clients using SquareletClient
83
+ self.documents = DocumentClient(self)
84
+ self.projects = ProjectClient(self)
85
+ self.users = UserClient(self)
86
+ self.organizations = OrganizationClient(self)
87
+
88
+ def request(self, method, url, raise_error=True, **kwargs):
89
+ for pattern_method, pattern, limiter, bucket_key in self._endpoint_limiters:
90
+ if pattern_method.upper() == method.upper() and pattern in url:
91
+ if not limiter.consume(bucket_key):
92
+ logger.warning(
93
+ "Rate limit reached for %s %s, throttling...",
94
+ method.upper(),
95
+ pattern,
96
+ )
97
+ while not limiter.consume(bucket_key):
98
+ time.sleep(0.1)
99
+ return super().request(method, url, raise_error=raise_error, **kwargs)
100
+ return super().request(method, url, raise_error=raise_error, **kwargs)
@@ -7,11 +7,13 @@ import datetime
7
7
  import logging
8
8
  import os
9
9
  import re
10
+ import time
10
11
  import warnings
11
12
  from functools import partial
12
13
  from urllib.parse import urlparse
13
14
 
14
15
  # Third Party
16
+ import token_bucket
15
17
  from requests.exceptions import RequestException
16
18
 
17
19
  # Local
@@ -28,6 +30,8 @@ logger = logging.getLogger("documentcloud")
28
30
 
29
31
  IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
30
32
 
33
+ DEFAULT_USER_AGENT = "python-documentcloud"
34
+
31
35
 
32
36
  class Document(BaseAPIObject):
33
37
  """A single DocumentCloud document"""
@@ -164,12 +168,17 @@ class Document(BaseAPIObject):
164
168
 
165
169
  if base_netloc == url_netloc:
166
170
  # if the url host is the same as the base api host,
167
- # sent the request with the client in order to include
171
+ # send the request with the client in order to include
168
172
  # authentication credentials
169
173
  response = self._client.get(url, full_url=True)
170
174
  else:
171
- response = requests_retry_session().get(
172
- url, headers={"User-Agent": "python-documentcloud2"}
175
+ response = self._client.documents.asset_get(
176
+ url,
177
+ headers={
178
+ "User-Agent": self._client.session.headers.get(
179
+ "User-Agent", DEFAULT_USER_AGENT
180
+ )
181
+ },
173
182
  )
174
183
  if fmt == "text":
175
184
  return response.content.decode("utf8")
@@ -246,6 +255,26 @@ class DocumentClient(BaseAPIClient):
246
255
  api_path = "documents"
247
256
  resource = Document
248
257
 
258
+ def __init__(self, client):
259
+ super().__init__(client)
260
+ # Rate limit for public document asset fetches (S3-hosted).
261
+ # Private document assets go through the API client and are limited there.
262
+ # Token bucket: burst of 100, sustained at 15/min (0.25/sec).
263
+ storage = token_bucket.MemoryStorage()
264
+ self._asset_limiter = token_bucket.Limiter(
265
+ rate=15 / 60,
266
+ capacity=100,
267
+ storage=storage,
268
+ )
269
+ self._asset_session = requests_retry_session()
270
+
271
+ def asset_get(self, url, **kwargs):
272
+ if not self._asset_limiter.consume("asset"):
273
+ logger.warning("Rate limit reached for asset fetch, throttling...")
274
+ while not self._asset_limiter.consume("asset"):
275
+ time.sleep(0.1)
276
+ return self._asset_session.get(url, **kwargs)
277
+
249
278
  def search(self, query, **params):
250
279
  """Return documents matching a search query"""
251
280
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-documentcloud
3
- Version: 4.6.0
3
+ Version: 4.7.0
4
4
  Summary: A simple Python wrapper for the DocumentCloud API
5
5
  Home-page: https://github.com/muckrock/python-documentcloud
6
6
  Author: Mitchell Kotler
@@ -27,6 +27,7 @@ Requires-Dist: urllib3
27
27
  Requires-Dist: pyyaml
28
28
  Requires-Dist: fastjsonschema
29
29
  Requires-Dist: python-squarelet
30
+ Requires-Dist: token-bucket
30
31
  Provides-Extra: dev
31
32
  Requires-Dist: black; extra == "dev"
32
33
  Requires-Dist: coverage; extra == "dev"
@@ -37,6 +38,7 @@ Requires-Dist: twine; extra == "dev"
37
38
  Provides-Extra: test
38
39
  Requires-Dist: pytest; extra == "test"
39
40
  Requires-Dist: pytest-mock; extra == "test"
41
+ Requires-Dist: pytest-xdist; extra == "test"
40
42
  Requires-Dist: pytest-recording; extra == "test"
41
43
  Requires-Dist: vcrpy; extra == "test"
42
44
  Dynamic: author
@@ -7,6 +7,7 @@ urllib3
7
7
  pyyaml
8
8
  fastjsonschema
9
9
  python-squarelet
10
+ token-bucket
10
11
 
11
12
  [dev]
12
13
  black
@@ -19,5 +20,6 @@ twine
19
20
  [test]
20
21
  pytest
21
22
  pytest-mock
23
+ pytest-xdist
22
24
  pytest-recording
23
25
  vcrpy
@@ -7,7 +7,7 @@ with open("README.md", "r") as fh:
7
7
 
8
8
  setup(
9
9
  name="python-documentcloud",
10
- version="4.6.0",
10
+ version="4.7.0",
11
11
  description="A simple Python wrapper for the DocumentCloud API",
12
12
  author="Mitchell Kotler",
13
13
  author_email="mitch@muckrock.com",
@@ -27,6 +27,7 @@ setup(
27
27
  "pyyaml",
28
28
  "fastjsonschema",
29
29
  "python-squarelet",
30
+ "token-bucket",
30
31
  ),
31
32
  extras_require={
32
33
  "dev": [
@@ -40,6 +41,7 @@ setup(
40
41
  "test": [
41
42
  "pytest",
42
43
  "pytest-mock",
44
+ "pytest-xdist",
43
45
  "pytest-recording",
44
46
  "vcrpy",
45
47
  ],
@@ -9,6 +9,7 @@ import pytest
9
9
  import ratelimit
10
10
 
11
11
  # DocumentCloud
12
+ from documentcloud import DocumentCloud
12
13
  from documentcloud.constants import RATE_LIMIT
13
14
  from documentcloud.exceptions import APIError, CredentialsFailedError
14
15
 
@@ -111,3 +112,77 @@ def test_expired_refresh_token(short_client, record_mode):
111
112
  assert short_client.users.get("me")
112
113
  # check the refresh token was updated
113
114
  assert old_refresh_token != short_client.refresh_token
115
+
116
+
117
+ def test_endpoint_rate_limit_burst_exhaustion():
118
+ """Token bucket should block after burst capacity is exhausted"""
119
+ client = DocumentCloud()
120
+ # Exhaust the search burst (capacity=50)
121
+ _pattern_method, _pattern, limiter, bucket_key = client._endpoint_limiters[0]
122
+ for _ in range(50):
123
+ limiter.consume(bucket_key)
124
+ assert not limiter.consume(bucket_key)
125
+
126
+
127
+ def test_endpoint_rate_limit_method_specificity():
128
+ """GET and POST to documents/ should use different limiters"""
129
+ client = DocumentCloud()
130
+ limiters = {(pm, p): lim for pm, p, lim, _ in client._endpoint_limiters}
131
+ assert limiters[("GET", "files/")] is not limiters[("POST", "documents/")]
132
+
133
+
134
+ def test_endpoint_rate_limit_pattern_ordering():
135
+ """documents/search should match before documents/"""
136
+ client = DocumentCloud()
137
+ url = "documents/search/"
138
+ matched = next(
139
+ p for pm, p, _, _ in client._endpoint_limiters if pm == "GET" and p in url
140
+ )
141
+ assert matched == "documents/search"
142
+
143
+
144
+ def test_asset_rate_limit_burst_exhaustion():
145
+ """Asset token bucket should block after burst capacity is exhausted"""
146
+ client = DocumentCloud()
147
+ limiter = client.documents._asset_limiter
148
+ for _ in range(100):
149
+ limiter.consume("asset")
150
+ assert not limiter.consume("asset")
151
+
152
+
153
+ def test_asset_rate_limit_refills():
154
+ """Asset token bucket should refill over time"""
155
+ client = DocumentCloud()
156
+ limiter = client.documents._asset_limiter
157
+ for _ in range(100):
158
+ limiter.consume("asset")
159
+ assert not limiter.consume("asset")
160
+ time.sleep(5)
161
+ assert limiter.consume("asset")
162
+
163
+
164
+ def test_endpoint_rate_limit_buckets_are_independent():
165
+ """Exhausting one endpoint's bucket should not affect another"""
166
+ client = DocumentCloud()
167
+ limiters = {(pm, p): (lim, bk) for pm, p, lim, bk in client._endpoint_limiters}
168
+ search_limiter, search_key = limiters[("GET", "documents/search")]
169
+ files_limiter, files_key = limiters[("GET", "files/")]
170
+
171
+ # Exhaust search bucket
172
+ for _ in range(50):
173
+ search_limiter.consume(search_key)
174
+ assert not search_limiter.consume(search_key)
175
+
176
+ # Files bucket should still have tokens
177
+ assert files_limiter.consume(files_key)
178
+
179
+
180
+ def test_endpoint_rate_limit_no_match_for_unrecognized_url():
181
+ """Unrecognized URLs should not match any endpoint limiter"""
182
+ client = DocumentCloud()
183
+ url = "users/me/"
184
+ matched = next(
185
+ (p for pm, p, _, _ in client._endpoint_limiters if p in url),
186
+ None,
187
+ )
188
+ assert matched is None
@@ -1,58 +0,0 @@
1
- # Import SquareletClient from python-squarelet
2
- # Standard Library
3
- import logging
4
-
5
- # Third Party
6
- from squarelet import SquareletClient
7
-
8
- # Local
9
- # Local Imports
10
- from .documents import DocumentClient
11
- from .organizations import OrganizationClient
12
- from .projects import ProjectClient
13
- from .users import UserClient
14
-
15
- logger = logging.getLogger("documentcloud")
16
-
17
-
18
- class DocumentCloud(SquareletClient):
19
- """
20
- The public interface for the DocumentCloud API, now integrated with SquareletClient
21
- """
22
-
23
- def __init__(
24
- self,
25
- username=None,
26
- password=None,
27
- base_uri="https://api.www.documentcloud.org/api/",
28
- auth_uri="https://accounts.muckrock.com/api/",
29
- timeout=20,
30
- loglevel=None,
31
- rate_limit=True,
32
- rate_limit_sleep=True,
33
- ):
34
- # Initialize SquareletClient for authentication and request handling
35
- super().__init__(
36
- base_uri=base_uri,
37
- username=username,
38
- password=password,
39
- auth_uri=auth_uri,
40
- timeout=timeout,
41
- rate_limit=rate_limit,
42
- rate_limit_sleep=rate_limit_sleep,
43
- )
44
-
45
- # Set up logging
46
- if loglevel:
47
- logging.basicConfig(
48
- level=loglevel,
49
- format="%(asctime)s %(levelname)-8s %(name)-25s %(message)s",
50
- )
51
- else:
52
- logger.addHandler(logging.NullHandler())
53
-
54
- # Initialize the sub-clients using SquareletClient
55
- self.documents = DocumentClient(self)
56
- self.projects = ProjectClient(self)
57
- self.users = UserClient(self)
58
- self.organizations = OrganizationClient(self)