python-documentcloud 4.6.0__py2.py3-none-any.whl → 4.7.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documentcloud/client.py +44 -2
- documentcloud/documents.py +32 -3
- {python_documentcloud-4.6.0.dist-info → python_documentcloud-4.7.0.dist-info}/METADATA +3 -1
- {python_documentcloud-4.6.0.dist-info → python_documentcloud-4.7.0.dist-info}/RECORD +7 -7
- {python_documentcloud-4.6.0.dist-info → python_documentcloud-4.7.0.dist-info}/WHEEL +0 -0
- {python_documentcloud-4.6.0.dist-info → python_documentcloud-4.7.0.dist-info}/licenses/LICENSE +0 -0
- {python_documentcloud-4.6.0.dist-info → python_documentcloud-4.7.0.dist-info}/top_level.txt +0 -0
documentcloud/client.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
# Import SquareletClient from python-squarelet
|
|
2
1
|
# Standard Library
|
|
3
2
|
import logging
|
|
3
|
+
import time
|
|
4
4
|
|
|
5
5
|
# Third Party
|
|
6
|
+
import token_bucket
|
|
6
7
|
from squarelet import SquareletClient
|
|
7
8
|
|
|
8
9
|
# Local
|
|
9
|
-
# Local Imports
|
|
10
10
|
from .documents import DocumentClient
|
|
11
11
|
from .organizations import OrganizationClient
|
|
12
12
|
from .projects import ProjectClient
|
|
@@ -14,6 +14,22 @@ from .users import UserClient
|
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger("documentcloud")
|
|
16
16
|
|
|
17
|
+
# Per-endpoint rate limits applied on top of the global squarelet limit.
# Every entry has the shape (method, url_pattern, rate_per_second, capacity).
#
#   Endpoint               Rate    Burst  Notes
#   --------               ----    -----  -----
#   GET  documents/search  15/min  50
#   POST documents/        12/min  100    25 docs/bulk call = up to 300 docs/min
#   PUT  documents/        12/min  100    25 docs/bulk call = up to 300 docs/min
#   GET  files/            15/min  100    PDFs, full text, and other private assets
#
# The table below is written in requests per minute (as in the notes above) and
# converted to the per-second rate that the limiter is constructed with.
ENDPOINT_RATE_LIMITS = [
    (verb, url_pattern, per_minute / 60, burst)
    for verb, url_pattern, per_minute, burst in (
        ("GET", "documents/search", 15, 50),
        ("POST", "documents/", 12, 100),
        ("PUT", "documents/", 12, 100),
        ("GET", "files/", 15, 100),
    )
]
|
|
32
|
+
|
|
17
33
|
|
|
18
34
|
class DocumentCloud(SquareletClient):
|
|
19
35
|
"""
|
|
@@ -51,8 +67,34 @@ class DocumentCloud(SquareletClient):
|
|
|
51
67
|
else:
|
|
52
68
|
logger.addHandler(logging.NullHandler())
|
|
53
69
|
|
|
70
|
+
# Build per-endpoint token bucket rate limiters
|
|
71
|
+
storage = token_bucket.MemoryStorage()
|
|
72
|
+
self._endpoint_limiters = [
|
|
73
|
+
(
|
|
74
|
+
pattern_method,
|
|
75
|
+
pattern,
|
|
76
|
+
token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage),
|
|
77
|
+
f"{pattern_method}:{pattern}",
|
|
78
|
+
)
|
|
79
|
+
for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS
|
|
80
|
+
]
|
|
81
|
+
|
|
54
82
|
# Initialize the sub-clients using SquareletClient
|
|
55
83
|
self.documents = DocumentClient(self)
|
|
56
84
|
self.projects = ProjectClient(self)
|
|
57
85
|
self.users = UserClient(self)
|
|
58
86
|
self.organizations = OrganizationClient(self)
|
|
87
|
+
|
|
88
|
+
def request(self, method, url, raise_error=True, **kwargs):
    """Send a request after waiting on the matching per-endpoint limiter.

    Only the first entry of ``self._endpoint_limiters`` whose HTTP method
    equals ``method`` (compared case-insensitively) and whose pattern occurs
    as a substring of ``url`` is applied. If no entry matches, the request is
    forwarded without any per-endpoint throttling.

    Args:
        method: HTTP method name, e.g. ``"GET"``.
        url: Request URL; matched against each pattern by substring.
        raise_error: Passed through unchanged to the parent ``request``.
        **kwargs: Passed through unchanged to the parent ``request``.

    Returns:
        Whatever the parent class's ``request`` returns.
    """
    # Find the first limiter entry that applies to this method + URL.
    matched = next(
        (
            entry
            for entry in self._endpoint_limiters
            if entry[0].upper() == method.upper() and entry[1] in url
        ),
        None,
    )

    if matched is not None:
        _, pattern, limiter, bucket_key = matched
        if not limiter.consume(bucket_key):
            # Warn once, then poll until the bucket yields a token.
            logger.warning(
                "Rate limit reached for %s %s, throttling...",
                method.upper(),
                pattern,
            )
            while not limiter.consume(bucket_key):
                time.sleep(0.1)

    return super().request(method, url, raise_error=raise_error, **kwargs)
|
documentcloud/documents.py
CHANGED
|
@@ -7,11 +7,13 @@ import datetime
|
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
9
|
import re
|
|
10
|
+
import time
|
|
10
11
|
import warnings
|
|
11
12
|
from functools import partial
|
|
12
13
|
from urllib.parse import urlparse
|
|
13
14
|
|
|
14
15
|
# Third Party
|
|
16
|
+
import token_bucket
|
|
15
17
|
from requests.exceptions import RequestException
|
|
16
18
|
|
|
17
19
|
# Local
|
|
@@ -28,6 +30,8 @@ logger = logging.getLogger("documentcloud")
|
|
|
28
30
|
|
|
29
31
|
IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
|
|
30
32
|
|
|
33
|
+
DEFAULT_USER_AGENT = "python-documentcloud"
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
class Document(BaseAPIObject):
|
|
33
37
|
"""A single DocumentCloud document"""
|
|
@@ -164,12 +168,17 @@ class Document(BaseAPIObject):
|
|
|
164
168
|
|
|
165
169
|
if base_netloc == url_netloc:
|
|
166
170
|
# if the url host is the same as the base api host,
|
|
167
|
-
#
|
|
171
|
+
# send the request with the client in order to include
|
|
168
172
|
# authentication credentials
|
|
169
173
|
response = self._client.get(url, full_url=True)
|
|
170
174
|
else:
|
|
171
|
-
response =
|
|
172
|
-
url,
|
|
175
|
+
response = self._client.documents.asset_get(
|
|
176
|
+
url,
|
|
177
|
+
headers={
|
|
178
|
+
"User-Agent": self._client.session.headers.get(
|
|
179
|
+
"User-Agent", DEFAULT_USER_AGENT
|
|
180
|
+
)
|
|
181
|
+
},
|
|
173
182
|
)
|
|
174
183
|
if fmt == "text":
|
|
175
184
|
return response.content.decode("utf8")
|
|
@@ -246,6 +255,26 @@ class DocumentClient(BaseAPIClient):
|
|
|
246
255
|
api_path = "documents"
|
|
247
256
|
resource = Document
|
|
248
257
|
|
|
258
|
+
def __init__(self, client):
    """Initialise the client and the throttled session used for asset fetches.

    Args:
        client: Forwarded unchanged to the parent class initialiser.
    """
    super().__init__(client)

    # This limiter covers fetches of public document assets (S3-hosted).
    # Private document assets go through the API client and are limited there.
    # Token bucket: burst of 100, sustained at 15/min (0.25/sec).
    bucket_storage = token_bucket.MemoryStorage()
    limiter_options = {
        "rate": 15 / 60,
        "capacity": 100,
        "storage": bucket_storage,
    }
    self._asset_limiter = token_bucket.Limiter(**limiter_options)

    # Dedicated session that asset_get issues its GET requests through.
    self._asset_session = requests_retry_session()
|
|
270
|
+
|
|
271
|
+
def asset_get(self, url, **kwargs):
    """GET ``url`` through the asset session, honouring the asset rate limit.

    A token is taken from ``self._asset_limiter`` under the key ``"asset"``
    before the request is sent. When none is available a warning is logged
    once and the call polls every 0.1 seconds until a token can be consumed.

    Args:
        url: URL to fetch.
        **kwargs: Passed through unchanged to the session's ``get``.

    Returns:
        The result of ``self._asset_session.get(url, **kwargs)``.
    """
    limiter = self._asset_limiter
    token_available = limiter.consume("asset")
    if not token_available:
        logger.warning("Rate limit reached for asset fetch, throttling...")
        # Poll until the bucket hands out a token.
        while not limiter.consume("asset"):
            time.sleep(0.1)
    return self._asset_session.get(url, **kwargs)
|
|
277
|
+
|
|
249
278
|
def search(self, query, **params):
|
|
250
279
|
"""Return documents matching a search query"""
|
|
251
280
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-documentcloud
|
|
3
|
-
Version: 4.
|
|
3
|
+
Version: 4.7.0
|
|
4
4
|
Summary: A simple Python wrapper for the DocumentCloud API
|
|
5
5
|
Home-page: https://github.com/muckrock/python-documentcloud
|
|
6
6
|
Author: Mitchell Kotler
|
|
@@ -27,6 +27,7 @@ Requires-Dist: urllib3
|
|
|
27
27
|
Requires-Dist: pyyaml
|
|
28
28
|
Requires-Dist: fastjsonschema
|
|
29
29
|
Requires-Dist: python-squarelet
|
|
30
|
+
Requires-Dist: token-bucket
|
|
30
31
|
Provides-Extra: dev
|
|
31
32
|
Requires-Dist: black; extra == "dev"
|
|
32
33
|
Requires-Dist: coverage; extra == "dev"
|
|
@@ -37,6 +38,7 @@ Requires-Dist: twine; extra == "dev"
|
|
|
37
38
|
Provides-Extra: test
|
|
38
39
|
Requires-Dist: pytest; extra == "test"
|
|
39
40
|
Requires-Dist: pytest-mock; extra == "test"
|
|
41
|
+
Requires-Dist: pytest-xdist; extra == "test"
|
|
40
42
|
Requires-Dist: pytest-recording; extra == "test"
|
|
41
43
|
Requires-Dist: vcrpy; extra == "test"
|
|
42
44
|
Dynamic: author
|
|
@@ -2,17 +2,17 @@ documentcloud/__init__.py,sha256=XAwOR6JYL-flQV_uC616AMA2rYiXTkeogNolqE6LzN4,220
|
|
|
2
2
|
documentcloud/addon.py,sha256=BvELxbc5pm7vYxo8bWY1VLLcn-VJAQQwmy8Y-G7n26c,12402
|
|
3
3
|
documentcloud/annotations.py,sha256=wVe3wYzyTRvc_hJ3r0m6iyDf6WIFlaGcCnyah_r53pg,2538
|
|
4
4
|
documentcloud/base.py,sha256=pNF45aleYpQ9fj75CiL3c4Ssv6MO1EmdzZ6wBLPKHDg,6545
|
|
5
|
-
documentcloud/client.py,sha256=
|
|
5
|
+
documentcloud/client.py,sha256=aLFncIVEnN9M3DN0SwL-t8jWIY9CR3Tr7AwSzEHHsP8,3542
|
|
6
6
|
documentcloud/constants.py,sha256=h6NStSkxPrjQ2gzaIlqftCF7tthkRimddOE8SsmlHag,1828
|
|
7
|
-
documentcloud/documents.py,sha256=
|
|
7
|
+
documentcloud/documents.py,sha256=sixDwg0cqwv0c45vIL4MKd3uyPt987e9t7GgqjBeh4k,19307
|
|
8
8
|
documentcloud/exceptions.py,sha256=Fq_v7QBcvj-l4yeT7ii_1MrGAPiRs8e1Fwz8qtB4Xqc,344
|
|
9
9
|
documentcloud/organizations.py,sha256=_Ot6MWzoa5JdU3jqedU-0Fec_K8WrgxqdlIp4oIijes,392
|
|
10
10
|
documentcloud/projects.py,sha256=KuOiw65a-8fdgbjo7BqjbEbWguds8inkhFJZJd578bs,5328
|
|
11
11
|
documentcloud/sections.py,sha256=cMf973KMvp6fAPSMXCD67L32Pz1_Tfh81oV2q2UQ9Uk,924
|
|
12
12
|
documentcloud/toolbox.py,sha256=zFZTyOn40YZjBpqa1H3qjpR4C3Wu1X2g72AvH_ljlic,1835
|
|
13
13
|
documentcloud/users.py,sha256=yydOXoEsfJlYqryZpXQ4G3aeRc5y_QCHqXd0dfF1aIc,354
|
|
14
|
-
python_documentcloud-4.
|
|
15
|
-
python_documentcloud-4.
|
|
16
|
-
python_documentcloud-4.
|
|
17
|
-
python_documentcloud-4.
|
|
18
|
-
python_documentcloud-4.
|
|
14
|
+
python_documentcloud-4.7.0.dist-info/licenses/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
|
|
15
|
+
python_documentcloud-4.7.0.dist-info/METADATA,sha256=QY7JfYCSWmqRz9amTLlZFedeGSw5OZuLqsTaOBcxEgs,2953
|
|
16
|
+
python_documentcloud-4.7.0.dist-info/WHEEL,sha256=TdQ5LtNwLuxTCjgxN51AgdU5w-KkB9ttmLbzjTH02pg,109
|
|
17
|
+
python_documentcloud-4.7.0.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
|
|
18
|
+
python_documentcloud-4.7.0.dist-info/RECORD,,
|
|
File without changes
|
{python_documentcloud-4.6.0.dist-info → python_documentcloud-4.7.0.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|