python-documentcloud 3.7.1__py2.py3-none-any.whl → 4.0.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documentcloud/addon.py +33 -3
- documentcloud/annotations.py +3 -8
- documentcloud/base.py +21 -40
- documentcloud/client.py +8 -13
- documentcloud/constants.py +93 -96
- documentcloud/documents.py +80 -74
- documentcloud/exceptions.py +2 -4
- documentcloud/organizations.py +0 -7
- documentcloud/projects.py +11 -22
- documentcloud/sections.py +4 -11
- documentcloud/toolbox.py +4 -9
- documentcloud/users.py +0 -7
- {python_documentcloud-3.7.1.dist-info → python_documentcloud-4.0.0.dist-info}/METADATA +15 -18
- python_documentcloud-4.0.0.dist-info/RECORD +18 -0
- python_documentcloud-3.7.1.dist-info/RECORD +0 -18
- {python_documentcloud-3.7.1.dist-info → python_documentcloud-4.0.0.dist-info}/LICENSE +0 -0
- {python_documentcloud-3.7.1.dist-info → python_documentcloud-4.0.0.dist-info}/WHEEL +0 -0
- {python_documentcloud-3.7.1.dist-info → python_documentcloud-4.0.0.dist-info}/top_level.txt +0 -0
documentcloud/addon.py
CHANGED
|
@@ -24,7 +24,7 @@ class BaseAddOn:
|
|
|
24
24
|
|
|
25
25
|
def __init__(self):
|
|
26
26
|
args = self._parse_arguments()
|
|
27
|
-
|
|
27
|
+
self._create_client(args)
|
|
28
28
|
|
|
29
29
|
# a unique identifier for this run
|
|
30
30
|
self.id = args.pop("id", None)
|
|
@@ -42,6 +42,8 @@ class BaseAddOn:
|
|
|
42
42
|
self.org_id = args.pop("organization", None)
|
|
43
43
|
# add on specific data
|
|
44
44
|
self.data = args.pop("data", None)
|
|
45
|
+
# title of the addon
|
|
46
|
+
self.title = args.pop("title", None)
|
|
45
47
|
|
|
46
48
|
def _create_client(self, args):
|
|
47
49
|
client_kwargs = {
|
|
@@ -63,7 +65,7 @@ class BaseAddOn:
|
|
|
63
65
|
self.client.refresh_token = args["refresh_token"]
|
|
64
66
|
if args["token"] is not None:
|
|
65
67
|
self.client.session.headers.update(
|
|
66
|
-
{"Authorization": "Bearer {
|
|
68
|
+
{"Authorization": f"Bearer {args['token']}"}
|
|
67
69
|
)
|
|
68
70
|
|
|
69
71
|
# custom user agent for AddOns
|
|
@@ -117,9 +119,11 @@ class BaseAddOn:
|
|
|
117
119
|
|
|
118
120
|
# validate parameter data
|
|
119
121
|
try:
|
|
120
|
-
with open("config.yaml") as config:
|
|
122
|
+
with open("config.yaml", encoding="utf-8") as config:
|
|
121
123
|
schema = yaml.safe_load(config)
|
|
122
124
|
args["data"] = fastjsonschema.validate(schema, args["data"])
|
|
125
|
+
# add title in case the add-on wants to reference its own title
|
|
126
|
+
args["title"] = schema.get("title")
|
|
123
127
|
except FileNotFoundError:
|
|
124
128
|
pass
|
|
125
129
|
except fastjsonschema.JsonSchemaException as exc:
|
|
@@ -171,6 +175,7 @@ class AddOn(BaseAddOn):
|
|
|
171
175
|
else:
|
|
172
176
|
# text file's buffer is in binary mode
|
|
173
177
|
data = file.buffer
|
|
178
|
+
# pylint: disable=W3101
|
|
174
179
|
response = requests.put(presigned_url, data=data)
|
|
175
180
|
response.raise_for_status()
|
|
176
181
|
return self.client.patch(
|
|
@@ -203,6 +208,8 @@ class AddOn(BaseAddOn):
|
|
|
203
208
|
documents = self.client.documents.search(self.query)
|
|
204
209
|
return documents.count
|
|
205
210
|
|
|
211
|
+
return 0
|
|
212
|
+
|
|
206
213
|
def get_documents(self):
|
|
207
214
|
"""Get documents from either selected or queried documents"""
|
|
208
215
|
if self.documents:
|
|
@@ -214,6 +221,29 @@ class AddOn(BaseAddOn):
|
|
|
214
221
|
|
|
215
222
|
yield from documents
|
|
216
223
|
|
|
224
|
+
def charge_credits(self, amount):
|
|
225
|
+
"""Charge the organization a certain amount of premium credits"""
|
|
226
|
+
|
|
227
|
+
if not self.id:
|
|
228
|
+
print(f"Charge credits: {amount}")
|
|
229
|
+
return None
|
|
230
|
+
elif not self.org_id:
|
|
231
|
+
self.set_message("No organization to charge.")
|
|
232
|
+
raise ValueError
|
|
233
|
+
|
|
234
|
+
resp = self.client.post(
|
|
235
|
+
f"organizations/{self.org_id}/ai_credits/",
|
|
236
|
+
json={
|
|
237
|
+
"ai_credits": amount,
|
|
238
|
+
"addonrun_id": self.id,
|
|
239
|
+
"note": f"AddOn run: {self.title} - {self.id}",
|
|
240
|
+
},
|
|
241
|
+
)
|
|
242
|
+
if resp.status_code != 200:
|
|
243
|
+
self.set_message("Error charging AI credits.")
|
|
244
|
+
raise ValueError
|
|
245
|
+
return resp
|
|
246
|
+
|
|
217
247
|
|
|
218
248
|
class CronAddOn(BaseAddOn):
|
|
219
249
|
"""DEPREACTED"""
|
documentcloud/annotations.py
CHANGED
|
@@ -1,8 +1,4 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
1
|
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
2
|
from listcrunch.listcrunch import uncrunch
|
|
7
3
|
|
|
8
4
|
# Local
|
|
@@ -10,7 +6,6 @@ from .base import BaseAPIObject, ChildAPIClient
|
|
|
10
6
|
from .toolbox import merge_dicts
|
|
11
7
|
|
|
12
8
|
|
|
13
|
-
@python_2_unicode_compatible
|
|
14
9
|
class Annotation(BaseAPIObject):
|
|
15
10
|
"""A note on a document"""
|
|
16
11
|
|
|
@@ -30,7 +25,7 @@ class Annotation(BaseAPIObject):
|
|
|
30
25
|
|
|
31
26
|
@property
|
|
32
27
|
def api_path(self):
|
|
33
|
-
return "documents/{
|
|
28
|
+
return f"documents/{self.document.id}/notes"
|
|
34
29
|
|
|
35
30
|
@property
|
|
36
31
|
def location(self):
|
|
@@ -71,7 +66,7 @@ class AnnotationClient(ChildAPIClient):
|
|
|
71
66
|
|
|
72
67
|
@property
|
|
73
68
|
def api_path(self):
|
|
74
|
-
return "documents/{
|
|
69
|
+
return f"documents/{self.parent.id}/notes"
|
|
75
70
|
|
|
76
71
|
def create(
|
|
77
72
|
self,
|
|
@@ -102,7 +97,7 @@ class AnnotationClient(ChildAPIClient):
|
|
|
102
97
|
"x2": x2,
|
|
103
98
|
"y2": y2,
|
|
104
99
|
}
|
|
105
|
-
response = self.client.post(self.api_path
|
|
100
|
+
response = self.client.post(f"{self.api_path}/", json=data)
|
|
106
101
|
return Annotation(
|
|
107
102
|
self.client, merge_dicts(response.json(), {"document": self.parent})
|
|
108
103
|
)
|
documentcloud/base.py
CHANGED
|
@@ -1,20 +1,14 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
1
|
# Standard Library
|
|
5
|
-
from builtins import str
|
|
6
2
|
from copy import copy
|
|
7
3
|
|
|
8
4
|
# Third Party
|
|
9
5
|
from dateutil.parser import parse as dateparser
|
|
10
|
-
from future.utils import python_2_unicode_compatible
|
|
11
6
|
|
|
12
7
|
# Local
|
|
13
8
|
from .exceptions import DuplicateObjectError
|
|
14
9
|
from .toolbox import get_id, merge_dicts
|
|
15
10
|
|
|
16
11
|
|
|
17
|
-
@python_2_unicode_compatible
|
|
18
12
|
class APIResults(object):
|
|
19
13
|
"""Class for encapsulating paginated list results from the API"""
|
|
20
14
|
|
|
@@ -39,10 +33,10 @@ class APIResults(object):
|
|
|
39
33
|
]
|
|
40
34
|
|
|
41
35
|
def __repr__(self):
|
|
42
|
-
return "<APIResults: {!r}"
|
|
36
|
+
return f"<APIResults: {self.results!r}>" # pragma: no cover
|
|
43
37
|
|
|
44
38
|
def __str__(self):
|
|
45
|
-
return "[{
|
|
39
|
+
return f"[{', '.join(str(r) for r in self.results)}]"
|
|
46
40
|
|
|
47
41
|
def __getitem__(self, key):
|
|
48
42
|
# pylint: disable=unsubscriptable-object
|
|
@@ -104,21 +98,19 @@ class BaseAPIClient(object):
|
|
|
104
98
|
params = {"expand": ",".join(expand)}
|
|
105
99
|
else:
|
|
106
100
|
params = {}
|
|
107
|
-
response = self.client.get(
|
|
108
|
-
"{}/{}/".format(self.api_path, get_id(id_)), params=params
|
|
109
|
-
)
|
|
101
|
+
response = self.client.get(f"{self.api_path}/{get_id(id_)}/", params=params)
|
|
110
102
|
# pylint: disable=not-callable
|
|
111
103
|
return self.resource(self.client, response.json())
|
|
112
104
|
|
|
113
105
|
def delete(self, id_):
|
|
114
106
|
"""Deletes a resource"""
|
|
115
|
-
self.client.delete("{}/{
|
|
107
|
+
self.client.delete(f"{self.api_path}/{get_id(id_)}")
|
|
116
108
|
|
|
117
109
|
def all(self, **params):
|
|
118
110
|
return self.list(**params)
|
|
119
111
|
|
|
120
112
|
def list(self, **params):
|
|
121
|
-
response = self.client.get(self.api_path
|
|
113
|
+
response = self.client.get(f"{self.api_path}/", params=params)
|
|
122
114
|
return APIResults(self.resource, self.client, response)
|
|
123
115
|
|
|
124
116
|
|
|
@@ -126,11 +118,11 @@ class ChildAPIClient(BaseAPIClient):
|
|
|
126
118
|
"""Base client for sub resources"""
|
|
127
119
|
|
|
128
120
|
def __init__(self, client, parent):
|
|
129
|
-
super(
|
|
121
|
+
super().__init__(client)
|
|
130
122
|
self.parent = parent
|
|
131
123
|
|
|
132
124
|
def list(self, **params):
|
|
133
|
-
response = self.client.get(self.api_path
|
|
125
|
+
response = self.client.get(f"{self.api_path}/", params=params)
|
|
134
126
|
parent_name = self.parent.__class__.__name__.lower()
|
|
135
127
|
return APIResults(
|
|
136
128
|
self.resource, self.client, response, {parent_name: self.parent}
|
|
@@ -156,9 +148,7 @@ class BaseAPIObject(object):
|
|
|
156
148
|
setattr(self, field, dateparser(getattr(self, field)))
|
|
157
149
|
|
|
158
150
|
def __repr__(self):
|
|
159
|
-
return "<{}: {} - {}>"
|
|
160
|
-
self.__class__.__name__, self.id, self
|
|
161
|
-
) # pragma: no cover
|
|
151
|
+
return f"<{self.__class__.__name__}: {self.id} - {self}>" # pragma: no cover
|
|
162
152
|
|
|
163
153
|
def __eq__(self, obj):
|
|
164
154
|
return isinstance(obj, type(self)) and self.id == obj.id
|
|
@@ -169,65 +159,56 @@ class BaseAPIObject(object):
|
|
|
169
159
|
|
|
170
160
|
def save(self):
|
|
171
161
|
data = {f: getattr(self, f) for f in self.writable_fields if hasattr(self, f)}
|
|
172
|
-
self._client.put("{}/{}/"
|
|
162
|
+
self._client.put(f"{self.api_path}/{self.id}/", json=data)
|
|
173
163
|
|
|
174
164
|
def delete(self):
|
|
175
|
-
self._client.delete("{}/{
|
|
165
|
+
self._client.delete(f"{self.api_path}/{self.id}")
|
|
176
166
|
|
|
177
167
|
|
|
178
|
-
@python_2_unicode_compatible
|
|
179
168
|
class APISet(list):
|
|
180
169
|
def __init__(self, iterable, resource):
|
|
181
|
-
super(
|
|
170
|
+
super().__init__(iterable)
|
|
182
171
|
self.resource = resource
|
|
183
172
|
if not all(isinstance(obj, self.resource) for obj in self):
|
|
184
173
|
raise TypeError(
|
|
185
|
-
"Only {} can be added to this list"
|
|
186
|
-
self.resource.__class__.__name__
|
|
187
|
-
)
|
|
174
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
188
175
|
)
|
|
189
176
|
ids = [obj.id for obj in self]
|
|
190
177
|
for id_ in ids:
|
|
191
178
|
if ids.count(id_) > 1:
|
|
192
179
|
raise DuplicateObjectError(
|
|
193
|
-
"Object with ID {} appears in the list more than once"
|
|
180
|
+
f"Object with ID {id_} appears in the list more than once"
|
|
194
181
|
)
|
|
195
182
|
|
|
196
183
|
def append(self, obj):
|
|
197
184
|
if not isinstance(obj, self.resource):
|
|
198
185
|
raise TypeError(
|
|
199
|
-
"Only {} can be added to this list"
|
|
200
|
-
self.resource.__class__.__name__
|
|
201
|
-
)
|
|
186
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
202
187
|
)
|
|
203
188
|
if obj.id in [i.id for i in self]:
|
|
204
189
|
raise DuplicateObjectError(
|
|
205
|
-
"Object with ID {} appears in the list more than once"
|
|
190
|
+
f"Object with ID {obj.id} appears in the list more than once"
|
|
206
191
|
)
|
|
207
|
-
super(
|
|
192
|
+
super().append(copy(obj))
|
|
208
193
|
|
|
209
194
|
def add(self, obj):
|
|
210
195
|
if not isinstance(obj, self.resource):
|
|
211
196
|
raise TypeError(
|
|
212
|
-
"Only {} can be added to this list"
|
|
213
|
-
self.resource.__class__.__name__
|
|
214
|
-
)
|
|
197
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
215
198
|
)
|
|
216
199
|
# skip duplicates silently
|
|
217
200
|
if obj.id not in [i.id for i in self]:
|
|
218
|
-
super(
|
|
201
|
+
super().append(copy(obj))
|
|
219
202
|
|
|
220
203
|
def extend(self, list_):
|
|
221
204
|
if not all(isinstance(obj, self.resource) for obj in list_):
|
|
222
205
|
raise TypeError(
|
|
223
|
-
"Only {} can be added to this list"
|
|
224
|
-
self.resource.__class__.__name__
|
|
225
|
-
)
|
|
206
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
226
207
|
)
|
|
227
208
|
ids = [obj.id for obj in self + list_]
|
|
228
209
|
for id_ in ids:
|
|
229
210
|
if ids.count(id_) > 1:
|
|
230
211
|
raise DuplicateObjectError(
|
|
231
|
-
"Object with ID {} appears in the list more than once"
|
|
212
|
+
f"Object with ID {id_} appears in the list more than once"
|
|
232
213
|
)
|
|
233
|
-
super(
|
|
214
|
+
super().extend(copy(obj) for obj in list_)
|
documentcloud/client.py
CHANGED
|
@@ -2,9 +2,6 @@
|
|
|
2
2
|
The public interface for the DocumentCloud API
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
# Future
|
|
6
|
-
from __future__ import division, print_function, unicode_literals
|
|
7
|
-
|
|
8
5
|
# Standard Library
|
|
9
6
|
import logging
|
|
10
7
|
from functools import partial
|
|
@@ -84,20 +81,18 @@ class DocumentCloud(object):
|
|
|
84
81
|
access_token = None
|
|
85
82
|
|
|
86
83
|
if access_token:
|
|
87
|
-
self.session.headers.update(
|
|
88
|
-
{"Authorization": "Bearer {}".format(access_token)}
|
|
89
|
-
)
|
|
84
|
+
self.session.headers.update({"Authorization": f"Bearer {access_token}"})
|
|
90
85
|
|
|
91
86
|
def _get_tokens(self, username, password):
|
|
92
87
|
"""Get an access and refresh token in exchange for the username and password"""
|
|
93
88
|
response = requests_retry_session().post(
|
|
94
|
-
"{}token/"
|
|
89
|
+
f"{self.auth_uri}token/",
|
|
95
90
|
json={"username": username, "password": password},
|
|
96
91
|
timeout=self.timeout,
|
|
97
92
|
)
|
|
98
93
|
|
|
99
94
|
if response.status_code == requests.codes.UNAUTHORIZED:
|
|
100
|
-
raise CredentialsFailedError("The username and password
|
|
95
|
+
raise CredentialsFailedError("The username and password are incorrect")
|
|
101
96
|
|
|
102
97
|
self.raise_for_status(response)
|
|
103
98
|
|
|
@@ -107,7 +102,7 @@ class DocumentCloud(object):
|
|
|
107
102
|
def _refresh_tokens(self, refresh_token):
|
|
108
103
|
"""Refresh the access and refresh tokens"""
|
|
109
104
|
response = requests_retry_session().post(
|
|
110
|
-
"{}refresh/"
|
|
105
|
+
f"{self.auth_uri}refresh/",
|
|
111
106
|
json={"refresh": refresh_token},
|
|
112
107
|
timeout=self.timeout,
|
|
113
108
|
)
|
|
@@ -136,7 +131,7 @@ class DocumentCloud(object):
|
|
|
136
131
|
full_url = kwargs.pop("full_url", False)
|
|
137
132
|
|
|
138
133
|
if not full_url:
|
|
139
|
-
url = "{}{}"
|
|
134
|
+
url = f"{self.base_uri}{url}"
|
|
140
135
|
|
|
141
136
|
# set the API to version 2.0
|
|
142
137
|
parsed_url = urlparse(url)
|
|
@@ -165,7 +160,7 @@ class DocumentCloud(object):
|
|
|
165
160
|
if attr in methods:
|
|
166
161
|
return partial(self._request, attr)
|
|
167
162
|
raise AttributeError(
|
|
168
|
-
"'{}' object has no attribute '{}'"
|
|
163
|
+
f"'{self.__class__.__name__}' object has no attribute '{attr}'"
|
|
169
164
|
)
|
|
170
165
|
|
|
171
166
|
def raise_for_status(self, response):
|
|
@@ -174,6 +169,6 @@ class DocumentCloud(object):
|
|
|
174
169
|
response.raise_for_status()
|
|
175
170
|
except requests.exceptions.RequestException as exc:
|
|
176
171
|
if exc.response.status_code == 404:
|
|
177
|
-
raise DoesNotExistError(response=exc.response)
|
|
172
|
+
raise DoesNotExistError(response=exc.response) from exc
|
|
178
173
|
else:
|
|
179
|
-
raise APIError(response=exc.response)
|
|
174
|
+
raise APIError(response=exc.response) from exc
|
documentcloud/constants.py
CHANGED
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
1
|
PER_PAGE_MAX = 100
|
|
5
2
|
BULK_LIMIT = 25
|
|
6
3
|
BASE_URI = "https://api.www.documentcloud.org/api/"
|
|
@@ -9,96 +6,96 @@ TIMEOUT = 20
|
|
|
9
6
|
RATE_LIMIT = 10
|
|
10
7
|
RATE_PERIOD = 1
|
|
11
8
|
SUPPORTED_EXTENSIONS = [
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
]
|
|
9
|
+
".abw",
|
|
10
|
+
".zabw",
|
|
11
|
+
".md",
|
|
12
|
+
".pm3",
|
|
13
|
+
".pm4",
|
|
14
|
+
".pm5",
|
|
15
|
+
".pm6",
|
|
16
|
+
".p65",
|
|
17
|
+
".cwk",
|
|
18
|
+
".agd",
|
|
19
|
+
".fhd",
|
|
20
|
+
".kth",
|
|
21
|
+
".key",
|
|
22
|
+
".numbers",
|
|
23
|
+
".pages",
|
|
24
|
+
".bmp",
|
|
25
|
+
".csv",
|
|
26
|
+
".txt",
|
|
27
|
+
".cdr",
|
|
28
|
+
".cmx",
|
|
29
|
+
".cgm",
|
|
30
|
+
".dif",
|
|
31
|
+
".dbf",
|
|
32
|
+
".xml",
|
|
33
|
+
".eps",
|
|
34
|
+
".emf",
|
|
35
|
+
".fb2",
|
|
36
|
+
".gnm",
|
|
37
|
+
".gnumeric",
|
|
38
|
+
".gif",
|
|
39
|
+
".hwp",
|
|
40
|
+
".plt",
|
|
41
|
+
".html",
|
|
42
|
+
".htm",
|
|
43
|
+
".jtd",
|
|
44
|
+
".jtt",
|
|
45
|
+
".jpg",
|
|
46
|
+
".jpeg",
|
|
47
|
+
".wk1",
|
|
48
|
+
".wks",
|
|
49
|
+
".123",
|
|
50
|
+
".wk3",
|
|
51
|
+
".wk4",
|
|
52
|
+
".pct",
|
|
53
|
+
".mml",
|
|
54
|
+
".xls",
|
|
55
|
+
".xlw",
|
|
56
|
+
".xlt",
|
|
57
|
+
".xlsx",
|
|
58
|
+
".docx",
|
|
59
|
+
".pptx",
|
|
60
|
+
".ppt",
|
|
61
|
+
".pps",
|
|
62
|
+
".pot",
|
|
63
|
+
".pptx",
|
|
64
|
+
".pub",
|
|
65
|
+
".rtf",
|
|
66
|
+
".xml",
|
|
67
|
+
".doc",
|
|
68
|
+
".dot",
|
|
69
|
+
".docx",
|
|
70
|
+
".wps",
|
|
71
|
+
".wks",
|
|
72
|
+
".wdb",
|
|
73
|
+
".wri",
|
|
74
|
+
".vsd",
|
|
75
|
+
".pgm",
|
|
76
|
+
".pbm",
|
|
77
|
+
".ppm",
|
|
78
|
+
".odt",
|
|
79
|
+
".fodt",
|
|
80
|
+
".ods",
|
|
81
|
+
".fods",
|
|
82
|
+
".odp",
|
|
83
|
+
".fodp",
|
|
84
|
+
".odg",
|
|
85
|
+
".fodg",
|
|
86
|
+
".odf",
|
|
87
|
+
".odb",
|
|
88
|
+
".sxw",
|
|
89
|
+
".stw",
|
|
90
|
+
".sxc",
|
|
91
|
+
".stc",
|
|
92
|
+
".sxi",
|
|
93
|
+
".sti",
|
|
94
|
+
".sxd",
|
|
95
|
+
".std",
|
|
96
|
+
".sxm",
|
|
97
|
+
".pcx",
|
|
98
|
+
".pcd",
|
|
99
|
+
".psd",
|
|
100
|
+
".pdf",
|
|
101
|
+
]
|
documentcloud/documents.py
CHANGED
|
@@ -2,19 +2,15 @@
|
|
|
2
2
|
Documents
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
# Future
|
|
6
|
-
from __future__ import division, print_function, unicode_literals
|
|
7
|
-
|
|
8
5
|
# Standard Library
|
|
6
|
+
import datetime
|
|
9
7
|
import logging
|
|
10
8
|
import os
|
|
11
9
|
import re
|
|
12
10
|
import warnings
|
|
13
|
-
import datetime
|
|
14
11
|
from functools import partial
|
|
15
12
|
|
|
16
13
|
# Third Party
|
|
17
|
-
from future.utils import python_2_unicode_compatible
|
|
18
14
|
from requests.exceptions import RequestException
|
|
19
15
|
|
|
20
16
|
# Local
|
|
@@ -32,11 +28,11 @@ try:
|
|
|
32
28
|
except ImportError:
|
|
33
29
|
from urlparse import urlparse
|
|
34
30
|
|
|
35
|
-
|
|
36
31
|
logger = logging.getLogger("documentcloud")
|
|
37
32
|
|
|
33
|
+
IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
|
|
34
|
+
|
|
38
35
|
|
|
39
|
-
@python_2_unicode_compatible
|
|
40
36
|
class Document(BaseAPIObject):
|
|
41
37
|
"""A single DocumentCloud document"""
|
|
42
38
|
|
|
@@ -60,13 +56,13 @@ class Document(BaseAPIObject):
|
|
|
60
56
|
for name, resource in objs:
|
|
61
57
|
value = dict_.get(name)
|
|
62
58
|
if isinstance(value, dict):
|
|
63
|
-
dict_["_"
|
|
64
|
-
dict_[name
|
|
59
|
+
dict_[f"_{name}"] = resource(client, value)
|
|
60
|
+
dict_[f"{name}_id"] = value.get("id")
|
|
65
61
|
elif isinstance(value, int):
|
|
66
|
-
dict_["_"
|
|
67
|
-
dict_[name
|
|
62
|
+
dict_[f"_{name}"] = None
|
|
63
|
+
dict_[f"{name}_id"] = value
|
|
68
64
|
|
|
69
|
-
super(
|
|
65
|
+
super().__init__(client, dict_)
|
|
70
66
|
|
|
71
67
|
self.sections = SectionClient(client, self)
|
|
72
68
|
self.annotations = AnnotationClient(client, self)
|
|
@@ -87,13 +83,13 @@ class Document(BaseAPIObject):
|
|
|
87
83
|
fmt = "json" if json else "text" if text else None
|
|
88
84
|
# this allows dropping `get_` to act like a property, ie
|
|
89
85
|
# .full_text_url
|
|
90
|
-
if not get and hasattr(self, "get_{}"
|
|
91
|
-
return getattr(self, "get_{}"
|
|
86
|
+
if not get and hasattr(self, f"get_{attr}"):
|
|
87
|
+
return getattr(self, f"get_{attr}")()
|
|
92
88
|
# this allows dropping `_url` to fetch the url, ie
|
|
93
89
|
# .get_full_text()
|
|
94
|
-
if not url and hasattr(self, "{}_url"
|
|
90
|
+
if not url and hasattr(self, f"{attr}_url"):
|
|
95
91
|
return lambda *a, **k: self._get_url(
|
|
96
|
-
getattr(self, "{}_url"
|
|
92
|
+
getattr(self, f"{attr}_url")(*a, **k), fmt
|
|
97
93
|
)
|
|
98
94
|
# this genericizes the image sizes
|
|
99
95
|
m_image = p_image.match(attr)
|
|
@@ -102,7 +98,7 @@ class Document(BaseAPIObject):
|
|
|
102
98
|
if m_image and not m_image.group("list"):
|
|
103
99
|
return partial(self.get_image_url, size=m_image.group("size"))
|
|
104
100
|
raise AttributeError(
|
|
105
|
-
"'{}' object has no attribute '{}'"
|
|
101
|
+
f"'{self.__class__.__name__}' object has no attribute '{attr}'"
|
|
106
102
|
)
|
|
107
103
|
|
|
108
104
|
def __dir__(self):
|
|
@@ -111,14 +107,14 @@ class Document(BaseAPIObject):
|
|
|
111
107
|
attrs += [a[len("get_") :] for a in getters]
|
|
112
108
|
attrs += [a[: -len("_url")] for a in getters if a.endswith("url")]
|
|
113
109
|
attrs += [a[len("get_") : -len("_url")] for a in getters if a.endswith("url")]
|
|
114
|
-
for size in
|
|
110
|
+
for size in IMAGE_SIZES:
|
|
115
111
|
attrs += [
|
|
116
|
-
"get_{}_image_url"
|
|
117
|
-
"{}_image_url"
|
|
118
|
-
"get_{}_image"
|
|
119
|
-
"{}_image"
|
|
120
|
-
"get_{}_image_url_list"
|
|
121
|
-
"{}_image_url_list"
|
|
112
|
+
f"get_{size}_image_url",
|
|
113
|
+
f"{size}_image_url",
|
|
114
|
+
f"get_{size}_image",
|
|
115
|
+
f"{size}_image",
|
|
116
|
+
f"get_{size}_image_url_list",
|
|
117
|
+
f"{size}_image_url_list",
|
|
122
118
|
]
|
|
123
119
|
return sorted(attrs)
|
|
124
120
|
|
|
@@ -185,27 +181,26 @@ class Document(BaseAPIObject):
|
|
|
185
181
|
|
|
186
182
|
# Resource URLs
|
|
187
183
|
def get_full_text_url(self):
|
|
188
|
-
return "{}documents/{}/{}.txt"
|
|
184
|
+
return f"{self.asset_url}documents/{self.id}/{self.slug}.txt"
|
|
189
185
|
|
|
190
186
|
def get_page_text_url(self, page=1):
|
|
191
|
-
return "{}documents/{}/pages/{}-p{}.txt"
|
|
192
|
-
self.asset_url, self.id, self.slug, page
|
|
193
|
-
)
|
|
187
|
+
return f"{self.asset_url}documents/{self.id}/pages/{self.slug}-p{page}.txt"
|
|
194
188
|
|
|
195
189
|
def get_page_position_json_url(self, page=1):
|
|
196
|
-
return
|
|
197
|
-
self.asset_url
|
|
190
|
+
return (
|
|
191
|
+
f"{self.asset_url}documents/{self.id}/pages/"
|
|
192
|
+
f"{self.slug}-p{page}.position.json"
|
|
198
193
|
)
|
|
199
194
|
|
|
200
195
|
def get_json_text_url(self):
|
|
201
|
-
return "{}documents/{}/{}.txt.json"
|
|
196
|
+
return f"{self.asset_url}documents/{self.id}/{self.slug}.txt.json"
|
|
202
197
|
|
|
203
198
|
def get_pdf_url(self):
|
|
204
|
-
return "{}documents/{}/{}.pdf"
|
|
199
|
+
return f"{self.asset_url}documents/{self.id}/{self.slug}.pdf"
|
|
205
200
|
|
|
206
201
|
def get_image_url(self, page=1, size="normal"):
|
|
207
|
-
return
|
|
208
|
-
self.asset_url
|
|
202
|
+
return (
|
|
203
|
+
f"{self.asset_url}documents/{self.id}/pages/{self.slug}-p{page}-{size}.gif"
|
|
209
204
|
)
|
|
210
205
|
|
|
211
206
|
def get_image_url_list(self, size="normal"):
|
|
@@ -215,27 +210,29 @@ class Document(BaseAPIObject):
|
|
|
215
210
|
|
|
216
211
|
def get_errors(self):
|
|
217
212
|
"""Retrieve errors for the document"""
|
|
218
|
-
endpoint = "documents/{}/errors/"
|
|
213
|
+
endpoint = f"documents/{self.id}/errors/"
|
|
219
214
|
all_results = []
|
|
220
215
|
|
|
221
216
|
while endpoint:
|
|
222
217
|
response = self._client.get(endpoint)
|
|
223
218
|
data = response.json()
|
|
224
219
|
|
|
225
|
-
results = data.get(
|
|
220
|
+
results = data.get("results", [])
|
|
226
221
|
for entry in results:
|
|
227
|
-
created_at_str = entry.get(
|
|
222
|
+
created_at_str = entry.get("created_at")
|
|
228
223
|
if created_at_str:
|
|
229
|
-
entry[
|
|
224
|
+
entry["created_at"] = datetime.datetime.strptime(
|
|
225
|
+
created_at_str, "%Y-%m-%dT%H:%M:%S.%fZ"
|
|
226
|
+
)
|
|
230
227
|
|
|
231
228
|
all_results.extend(results)
|
|
232
|
-
endpoint = data.get(
|
|
229
|
+
endpoint = data.get("next")
|
|
233
230
|
|
|
234
231
|
return all_results
|
|
235
|
-
|
|
232
|
+
|
|
236
233
|
def process(self):
|
|
237
234
|
"""Reprocess the document"""
|
|
238
|
-
self._client.post("{}/{}/process/"
|
|
235
|
+
self._client.post(f"{self.api_path}/{self.id}/process/")
|
|
239
236
|
|
|
240
237
|
|
|
241
238
|
class DocumentClient(BaseAPIClient):
|
|
@@ -271,6 +268,16 @@ class DocumentClient(BaseAPIClient):
|
|
|
271
268
|
|
|
272
269
|
def upload(self, pdf, **kwargs):
|
|
273
270
|
"""Upload a document"""
|
|
271
|
+
|
|
272
|
+
def check_size(size):
|
|
273
|
+
# DocumentCloud's size limit is set to 501MB to give people a little leeway
|
|
274
|
+
# for OS rounding
|
|
275
|
+
if size >= 501 * 1024 * 1024:
|
|
276
|
+
raise ValueError(
|
|
277
|
+
"The pdf you have submitted is over the DocumentCloud API's 500MB "
|
|
278
|
+
"file size limit. Split it into smaller pieces and try again."
|
|
279
|
+
)
|
|
280
|
+
|
|
274
281
|
# if they pass in a URL, use the URL upload flow
|
|
275
282
|
if is_url(pdf):
|
|
276
283
|
return self._upload_url(pdf, **kwargs)
|
|
@@ -281,19 +288,13 @@ class DocumentClient(BaseAPIClient):
|
|
|
281
288
|
size = os.fstat(pdf.fileno()).st_size
|
|
282
289
|
except (AttributeError, OSError): # pragma: no cover
|
|
283
290
|
size = 0
|
|
291
|
+
check_size(size)
|
|
292
|
+
return self._upload_file(pdf, **kwargs)
|
|
284
293
|
else:
|
|
285
294
|
size = os.path.getsize(pdf)
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
# for OS rounding
|
|
290
|
-
if size >= 501 * 1024 * 1024:
|
|
291
|
-
raise ValueError(
|
|
292
|
-
"The pdf you have submitted is over the DocumentCloud API's 500MB "
|
|
293
|
-
"file size limit. Split it into smaller pieces and try again."
|
|
294
|
-
)
|
|
295
|
-
|
|
296
|
-
return self._upload_file(pdf, **kwargs)
|
|
295
|
+
check_size(size)
|
|
296
|
+
with open(pdf, "rb") as pdf_file:
|
|
297
|
+
return self._upload_file(pdf_file, **kwargs)
|
|
297
298
|
|
|
298
299
|
def _format_upload_parameters(self, name, **kwargs):
|
|
299
300
|
"""Prepare upload parameters from kwargs"""
|
|
@@ -327,9 +328,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
327
328
|
|
|
328
329
|
for param in ignored_parameters:
|
|
329
330
|
if param in kwargs:
|
|
330
|
-
warnings.warn(
|
|
331
|
-
"The parameter `{}` is not currently supported".format(param)
|
|
332
|
-
)
|
|
331
|
+
warnings.warn(f"The parameter `{param}` is not currently supported")
|
|
333
332
|
|
|
334
333
|
return params
|
|
335
334
|
|
|
@@ -359,7 +358,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
359
358
|
# begin processing the document
|
|
360
359
|
doc_id = create_json["id"]
|
|
361
360
|
response = self.client.post(
|
|
362
|
-
"documents/{}/process/"
|
|
361
|
+
f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
|
|
363
362
|
)
|
|
364
363
|
|
|
365
364
|
return Document(self.client, create_json)
|
|
@@ -379,11 +378,13 @@ class DocumentClient(BaseAPIClient):
|
|
|
379
378
|
|
|
380
379
|
def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
|
|
381
380
|
"""Upload files with specified extensions in a directory"""
|
|
381
|
+
# pylint: disable=too-many-locals, too-many-branches
|
|
382
382
|
|
|
383
383
|
# Do not set the same title for all documents
|
|
384
384
|
kwargs.pop("title", None)
|
|
385
385
|
|
|
386
|
-
# If extensions
|
|
386
|
+
# If extensions are specified as None, it will check for all supported
|
|
387
|
+
# filetypes.
|
|
387
388
|
if extensions is None:
|
|
388
389
|
extensions = SUPPORTED_EXTENSIONS
|
|
389
390
|
|
|
@@ -402,7 +403,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
402
403
|
path_list = self._collect_files(path, extensions)
|
|
403
404
|
|
|
404
405
|
logger.info(
|
|
405
|
-
"Upload directory on %s: Found %d files to upload",
|
|
406
|
+
"Upload directory on %s: Found %d files to upload",
|
|
407
|
+
path,
|
|
408
|
+
len(path_list)
|
|
406
409
|
)
|
|
407
410
|
|
|
408
411
|
# Upload all the files using the bulk API to reduce the number
|
|
@@ -413,7 +416,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
413
416
|
# Grouper will put None's on the end of the last group
|
|
414
417
|
file_paths = [p for p in file_paths if p is not None]
|
|
415
418
|
|
|
416
|
-
logger.info("Uploading group %d
|
|
419
|
+
logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
|
|
417
420
|
|
|
418
421
|
# Create the documents
|
|
419
422
|
logger.info("Creating the documents...")
|
|
@@ -438,9 +441,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
438
441
|
except (APIError, RequestException) as exc:
|
|
439
442
|
if handle_errors:
|
|
440
443
|
logger.info(
|
|
441
|
-
"Error creating the following documents: %s
|
|
444
|
+
"Error creating the following documents: %s\n%s",
|
|
442
445
|
exc,
|
|
443
|
-
"\n".join(file_paths)
|
|
446
|
+
"\n".join(file_paths)
|
|
444
447
|
)
|
|
445
448
|
continue
|
|
446
449
|
else:
|
|
@@ -453,16 +456,15 @@ class DocumentClient(BaseAPIClient):
|
|
|
453
456
|
for url, file_path in zip(presigned_urls, file_paths):
|
|
454
457
|
logger.info("Uploading %s to S3...", file_path)
|
|
455
458
|
try:
|
|
456
|
-
|
|
457
|
-
url, data=
|
|
458
|
-
)
|
|
459
|
+
with open(file_path, "rb") as file:
|
|
460
|
+
response = requests_retry_session().put(url, data=file.read())
|
|
459
461
|
self.client.raise_for_status(response)
|
|
460
462
|
except (APIError, RequestException) as exc:
|
|
461
463
|
if handle_errors:
|
|
462
464
|
logger.info(
|
|
463
465
|
"Error uploading the following document: %s %s",
|
|
464
466
|
exc,
|
|
465
|
-
file_path
|
|
467
|
+
file_path
|
|
466
468
|
)
|
|
467
469
|
continue
|
|
468
470
|
else:
|
|
@@ -476,9 +478,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
476
478
|
except (APIError, RequestException) as exc:
|
|
477
479
|
if handle_errors:
|
|
478
480
|
logger.info(
|
|
479
|
-
"Error creating the following documents: %s
|
|
481
|
+
"Error creating the following documents: %s\n%s",
|
|
480
482
|
exc,
|
|
481
|
-
"\n".join(file_paths)
|
|
483
|
+
"\n".join(file_paths)
|
|
482
484
|
)
|
|
483
485
|
continue
|
|
484
486
|
else:
|
|
@@ -501,7 +503,11 @@ class DocumentClient(BaseAPIClient):
|
|
|
501
503
|
# Grouper will put None's on the end of the last group
|
|
502
504
|
url_group = [url for url in url_group if url is not None]
|
|
503
505
|
|
|
504
|
-
logger.info(
|
|
506
|
+
logger.info(
|
|
507
|
+
"Uploading group %d: %s",
|
|
508
|
+
i + 1,
|
|
509
|
+
"\n".join(url_group)
|
|
510
|
+
)
|
|
505
511
|
|
|
506
512
|
# Create the documents
|
|
507
513
|
logger.info("Creating the documents...")
|
|
@@ -522,9 +528,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
522
528
|
except (APIError, RequestException) as exc:
|
|
523
529
|
if handle_errors:
|
|
524
530
|
logger.info(
|
|
525
|
-
"Error creating the following documents: %s
|
|
526
|
-
exc,
|
|
527
|
-
"\n".join(url_group)
|
|
531
|
+
"Error creating the following documents: %s\n%s",
|
|
532
|
+
str(exc),
|
|
533
|
+
"\n".join(url_group)
|
|
528
534
|
)
|
|
529
535
|
continue
|
|
530
536
|
else:
|
|
@@ -538,7 +544,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
538
544
|
# Pass back the list of documents
|
|
539
545
|
return [Document(self.client, d) for d in obj_list]
|
|
540
546
|
|
|
541
|
-
|
|
547
|
+
|
|
542
548
|
class Mention:
|
|
543
549
|
"""A snippet from a document search"""
|
|
544
550
|
|
|
@@ -549,7 +555,7 @@ class Mention:
|
|
|
549
555
|
self.text = text
|
|
550
556
|
|
|
551
557
|
def __repr__(self):
|
|
552
|
-
return "<{}: {}>"
|
|
558
|
+
return f"<{self.__class__.__name__}: {self}>" # pragma: no cover
|
|
553
559
|
|
|
554
560
|
def __str__(self):
|
|
555
|
-
return '{} - "{}"'
|
|
561
|
+
return f'{self.page} - "{self.text}"'
|
documentcloud/exceptions.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Custom exceptions for python-documentcloud
|
|
3
3
|
"""
|
|
4
|
-
# Future
|
|
5
|
-
from __future__ import division, print_function, unicode_literals
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
class DocumentCloudError(Exception):
|
|
@@ -14,11 +12,11 @@ class DocumentCloudError(Exception):
|
|
|
14
12
|
self.error = self.response.text
|
|
15
13
|
self.status_code = self.response.status_code
|
|
16
14
|
if not args:
|
|
17
|
-
args = ["{} - {
|
|
15
|
+
args = [f"{self.status_code} - {self.error}"]
|
|
18
16
|
else:
|
|
19
17
|
self.error = None
|
|
20
18
|
self.status_code = None
|
|
21
|
-
super(
|
|
19
|
+
super().__init__(*args, **kwargs)
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
class DuplicateObjectError(DocumentCloudError):
|
documentcloud/organizations.py
CHANGED
|
@@ -1,14 +1,7 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import BaseAPIClient, BaseAPIObject
|
|
9
3
|
|
|
10
4
|
|
|
11
|
-
@python_2_unicode_compatible
|
|
12
5
|
class Organization(BaseAPIObject):
|
|
13
6
|
"""A documentcloud organization"""
|
|
14
7
|
|
documentcloud/projects.py
CHANGED
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import APISet, BaseAPIClient, BaseAPIObject
|
|
9
3
|
from .constants import BULK_LIMIT, PER_PAGE_MAX
|
|
@@ -12,7 +6,6 @@ from .exceptions import DoesNotExistError, MultipleObjectsReturnedError
|
|
|
12
6
|
from .toolbox import get_id, grouper
|
|
13
7
|
|
|
14
8
|
|
|
15
|
-
@python_2_unicode_compatible
|
|
16
9
|
class Project(BaseAPIObject):
|
|
17
10
|
"""A documentcloud project"""
|
|
18
11
|
|
|
@@ -21,7 +14,7 @@ class Project(BaseAPIObject):
|
|
|
21
14
|
|
|
22
15
|
def __init__(self, *args, **kwargs):
|
|
23
16
|
per_page = kwargs.pop("per_page", PER_PAGE_MAX)
|
|
24
|
-
super(
|
|
17
|
+
super().__init__(*args, **kwargs)
|
|
25
18
|
self._document_list = None
|
|
26
19
|
self._per_page = per_page
|
|
27
20
|
|
|
@@ -30,7 +23,7 @@ class Project(BaseAPIObject):
|
|
|
30
23
|
|
|
31
24
|
def save(self):
|
|
32
25
|
"""Add the documents to the project as well"""
|
|
33
|
-
super(
|
|
26
|
+
super().save()
|
|
34
27
|
if self._document_list:
|
|
35
28
|
self.clear_documents()
|
|
36
29
|
self.add_documents(self._document_list)
|
|
@@ -39,7 +32,7 @@ class Project(BaseAPIObject):
|
|
|
39
32
|
def document_list(self):
|
|
40
33
|
if self._document_list is None:
|
|
41
34
|
response = self._client.get(
|
|
42
|
-
"{}/{
|
|
35
|
+
f"{self.api_path}/{get_id(self.id)}/documents/",
|
|
43
36
|
params={"per_page": self._per_page, "expand": ["document"]},
|
|
44
37
|
)
|
|
45
38
|
json = response.json()
|
|
@@ -78,16 +71,14 @@ class Project(BaseAPIObject):
|
|
|
78
71
|
|
|
79
72
|
def get_document(self, doc_id):
|
|
80
73
|
response = self._client.get(
|
|
81
|
-
"{}/{}/documents/{}"
|
|
74
|
+
f"{self.api_path}/{get_id(self.id)}/documents/{doc_id}",
|
|
82
75
|
params={"expand": ["document"]},
|
|
83
76
|
)
|
|
84
77
|
return Document(self._client, response.json()["document"])
|
|
85
78
|
|
|
86
79
|
def clear_documents(self):
|
|
87
80
|
"""Remove all documents from this project"""
|
|
88
|
-
self._client.put(
|
|
89
|
-
"{}/{}/documents/".format(self.api_path, self.id), json=[]
|
|
90
|
-
)
|
|
81
|
+
self._client.put(f"{self.api_path}/{self.id}/documents/", json=[])
|
|
91
82
|
|
|
92
83
|
def add_documents(self, documents):
|
|
93
84
|
"""Efficient way to bulk add documents to a project"""
|
|
@@ -95,9 +86,7 @@ class Project(BaseAPIObject):
|
|
|
95
86
|
for data_group in grouper(data, BULK_LIMIT):
|
|
96
87
|
# Grouper will put None's on the end of the last group
|
|
97
88
|
data_group = [d for d in data_group if d is not None]
|
|
98
|
-
self._client.patch(
|
|
99
|
-
"{}/{}/documents/".format(self.api_path, self.id), json=data_group
|
|
100
|
-
)
|
|
89
|
+
self._client.patch(f"{self.api_path}/{self.id}/documents/", json=data_group)
|
|
101
90
|
|
|
102
91
|
|
|
103
92
|
class ProjectClient(BaseAPIClient):
|
|
@@ -106,12 +95,12 @@ class ProjectClient(BaseAPIClient):
|
|
|
106
95
|
api_path = "projects"
|
|
107
96
|
resource = Project
|
|
108
97
|
|
|
109
|
-
# all is
|
|
98
|
+
# all is overridden to filter by the current user for backward compatibility
|
|
110
99
|
def all(self, **params):
|
|
111
100
|
return self.list(user=self.client.user_id, **params)
|
|
112
101
|
|
|
113
102
|
def get(self, id=None, title=None):
|
|
114
|
-
# pylint:disable=redefined-builtin, arguments-
|
|
103
|
+
# pylint:disable=redefined-builtin, arguments-renamed
|
|
115
104
|
# pylint disables are necessary for backward compatibility
|
|
116
105
|
if id is not None and title is not None:
|
|
117
106
|
raise ValueError(
|
|
@@ -126,11 +115,11 @@ class ProjectClient(BaseAPIClient):
|
|
|
126
115
|
return self.get_by_title(title)
|
|
127
116
|
|
|
128
117
|
def get_by_id(self, id_):
|
|
129
|
-
return super(
|
|
118
|
+
return super().get(id_)
|
|
130
119
|
|
|
131
120
|
def get_by_title(self, title):
|
|
132
121
|
response = self.client.get(
|
|
133
|
-
self.api_path
|
|
122
|
+
f"{self.api_path}/", params={"title": title, "user": self.client.user_id}
|
|
134
123
|
)
|
|
135
124
|
json = response.json()
|
|
136
125
|
count = len(json["results"])
|
|
@@ -148,7 +137,7 @@ class ProjectClient(BaseAPIClient):
|
|
|
148
137
|
if document_ids:
|
|
149
138
|
data = [{"document": d} for d in document_ids]
|
|
150
139
|
response = self.client.put(
|
|
151
|
-
"{}/{}/documents/"
|
|
140
|
+
f"{self.api_path}/{project.id}/documents/", json=data
|
|
152
141
|
)
|
|
153
142
|
return project
|
|
154
143
|
|
documentcloud/sections.py
CHANGED
|
@@ -1,26 +1,19 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import BaseAPIObject, ChildAPIClient
|
|
9
3
|
from .toolbox import merge_dicts
|
|
10
4
|
|
|
11
5
|
|
|
12
|
-
@python_2_unicode_compatible
|
|
13
6
|
class Section(BaseAPIObject):
|
|
14
7
|
"""A section of a document"""
|
|
15
8
|
|
|
16
9
|
writable_fields = ["page_number", "title"]
|
|
17
10
|
|
|
18
11
|
def __str__(self):
|
|
19
|
-
return "{} - p{
|
|
12
|
+
return f"{self.title} - p{self.page}"
|
|
20
13
|
|
|
21
14
|
@property
|
|
22
15
|
def api_path(self):
|
|
23
|
-
return "documents/{
|
|
16
|
+
return f"documents/{self.document.id}/sections"
|
|
24
17
|
|
|
25
18
|
@property
|
|
26
19
|
def page(self):
|
|
@@ -34,11 +27,11 @@ class SectionClient(ChildAPIClient):
|
|
|
34
27
|
|
|
35
28
|
@property
|
|
36
29
|
def api_path(self):
|
|
37
|
-
return "documents/{
|
|
30
|
+
return f"documents/{self.parent.id}/sections"
|
|
38
31
|
|
|
39
32
|
def create(self, title, page_number):
|
|
40
33
|
data = {"title": title, "page_number": page_number}
|
|
41
|
-
response = self.client.post(self.api_path
|
|
34
|
+
response = self.client.post(f"{self.api_path}/", json=data)
|
|
42
35
|
return Section(
|
|
43
36
|
self.client, merge_dicts(response.json(), {"document": self.parent})
|
|
44
37
|
)
|
documentcloud/toolbox.py
CHANGED
|
@@ -1,21 +1,16 @@
|
|
|
1
1
|
"""
|
|
2
2
|
A few toys the API will use.
|
|
3
3
|
"""
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
|
|
5
|
+
# Standard Library
|
|
6
|
+
from itertools import zip_longest
|
|
7
|
+
from urllib.parse import urlparse
|
|
6
8
|
|
|
7
9
|
# Third Party
|
|
8
10
|
import requests
|
|
9
11
|
from requests.adapters import HTTPAdapter
|
|
10
12
|
from urllib3.util.retry import Retry
|
|
11
13
|
|
|
12
|
-
try:
|
|
13
|
-
from urllib.parse import urlparse
|
|
14
|
-
from itertools import zip_longest
|
|
15
|
-
except ImportError:
|
|
16
|
-
from urlparse import urlparse
|
|
17
|
-
from itertools import izip_longest as zip_longest
|
|
18
|
-
|
|
19
14
|
|
|
20
15
|
def requests_retry_session(
|
|
21
16
|
retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None
|
documentcloud/users.py
CHANGED
|
@@ -1,14 +1,7 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import BaseAPIClient, BaseAPIObject
|
|
9
3
|
|
|
10
4
|
|
|
11
|
-
@python_2_unicode_compatible
|
|
12
5
|
class User(BaseAPIObject):
|
|
13
6
|
"""A documentcloud user"""
|
|
14
7
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: python-documentcloud
|
|
3
|
-
Version:
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: A simple Python wrapper for the DocumentCloud API
|
|
5
5
|
Home-page: https://github.com/muckrock/python-documentcloud
|
|
6
6
|
Author: Mitchell Kotler
|
|
@@ -11,24 +11,23 @@ Classifier: Development Status :: 5 - Production/Stable
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Operating System :: OS Independent
|
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
-
Classifier: Programming Language :: Python
|
|
15
|
-
Classifier: Programming Language :: Python :: 2
|
|
16
|
-
Classifier: Programming Language :: Python :: 2.7
|
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
19
14
|
Classifier: Programming Language :: Python :: 3.7
|
|
20
15
|
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
20
|
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
22
|
License-File: LICENSE
|
|
23
|
+
Requires-Dist: fastjsonschema
|
|
24
24
|
Requires-Dist: future
|
|
25
25
|
Requires-Dist: listcrunch (>=1.0.1)
|
|
26
26
|
Requires-Dist: python-dateutil
|
|
27
|
+
Requires-Dist: pyyaml
|
|
27
28
|
Requires-Dist: ratelimit
|
|
28
29
|
Requires-Dist: requests
|
|
29
30
|
Requires-Dist: urllib3
|
|
30
|
-
Requires-Dist: pyyaml
|
|
31
|
-
Requires-Dist: fastjsonschema
|
|
32
31
|
Provides-Extra: dev
|
|
33
32
|
Requires-Dist: black ; extra == 'dev'
|
|
34
33
|
Requires-Dist: coverage ; extra == 'dev'
|
|
@@ -50,19 +49,17 @@ Requires-Dist: vcrpy ; extra == 'test'
|
|
|
50
49
|
|
|
51
50
|
A simple python wrapper for the DocumentCloud API
|
|
52
51
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
52
|
+
- Documentation: [http://documentcloud.readthedocs.org/](http://documentcloud.readthedocs.org/)
|
|
53
|
+
- Issues: [https://github.com/muckrock/python-documentcloud/issues](https://github.com/muckrock/python-documentcloud/issues)
|
|
54
|
+
- Packaging: [https://pypi.python.org/pypi/python-documentcloud](https://pypi.python.org/pypi/python-documentcloud)
|
|
56
55
|
|
|
57
|
-
Features
|
|
58
|
-
--------
|
|
56
|
+
## Features
|
|
59
57
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
- Retrieve and edit documents and projects, both public and private, from documentcloud.org
|
|
59
|
+
- Upload PDFs into your documentcloud.org account and organize them into projects
|
|
60
|
+
- Download text and images extracted from your PDFs by DocumentCloud
|
|
63
61
|
|
|
64
|
-
Getting started
|
|
65
|
-
---------------
|
|
62
|
+
## Getting started
|
|
66
63
|
|
|
67
64
|
Installation is as easy as...
|
|
68
65
|
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
documentcloud/__init__.py,sha256=XAwOR6JYL-flQV_uC616AMA2rYiXTkeogNolqE6LzN4,220
|
|
2
|
+
documentcloud/addon.py,sha256=3FxQjm26jknjLdd-GuztiZO4Z7NcgXq4WqunE9oh2es,11754
|
|
3
|
+
documentcloud/annotations.py,sha256=wVe3wYzyTRvc_hJ3r0m6iyDf6WIFlaGcCnyah_r53pg,2538
|
|
4
|
+
documentcloud/base.py,sha256=S53bqF67-1CAseAqvzPgVbKn856Sdhnq6xmdnSUPgKU,6543
|
|
5
|
+
documentcloud/client.py,sha256=vZvAHd5-lQ8o1Id5esoPuZt5BOBeW74o2wJYz3o32rw,5843
|
|
6
|
+
documentcloud/constants.py,sha256=4GuvF140iB3-0lAvyLUVuuVu4PYjqdkHOAn49dEjsbQ,1333
|
|
7
|
+
documentcloud/documents.py,sha256=g73_THJLfiS5arQj5xT5BH4Z46KMQUllWsuIKGltSGI,19531
|
|
8
|
+
documentcloud/exceptions.py,sha256=nLyrg_6KhBotsZp0L6-Mf4A6HYYS60AV2FxSVSq9xAk,1161
|
|
9
|
+
documentcloud/organizations.py,sha256=_Ot6MWzoa5JdU3jqedU-0Fec_K8WrgxqdlIp4oIijes,392
|
|
10
|
+
documentcloud/projects.py,sha256=KuOiw65a-8fdgbjo7BqjbEbWguds8inkhFJZJd578bs,5328
|
|
11
|
+
documentcloud/sections.py,sha256=cMf973KMvp6fAPSMXCD67L32Pz1_Tfh81oV2q2UQ9Uk,924
|
|
12
|
+
documentcloud/toolbox.py,sha256=zFZTyOn40YZjBpqa1H3qjpR4C3Wu1X2g72AvH_ljlic,1835
|
|
13
|
+
documentcloud/users.py,sha256=yydOXoEsfJlYqryZpXQ4G3aeRc5y_QCHqXd0dfF1aIc,354
|
|
14
|
+
python_documentcloud-4.0.0.dist-info/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
|
|
15
|
+
python_documentcloud-4.0.0.dist-info/METADATA,sha256=l7T27s2poNlDB3JS_Ighte1FQOR4vFfs_DQTUfZj73A,2696
|
|
16
|
+
python_documentcloud-4.0.0.dist-info/WHEEL,sha256=z9j0xAa_JmUKMpmz72K0ZGALSM_n-wQVmGbleXx2VHg,110
|
|
17
|
+
python_documentcloud-4.0.0.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
|
|
18
|
+
python_documentcloud-4.0.0.dist-info/RECORD,,
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
documentcloud/__init__.py,sha256=XAwOR6JYL-flQV_uC616AMA2rYiXTkeogNolqE6LzN4,220
|
|
2
|
-
documentcloud/addon.py,sha256=7VCe-ilQpn3f31cz2lzPTWdAJbac4BTTZxYjE2yhtQw,10750
|
|
3
|
-
documentcloud/annotations.py,sha256=g835EM9kQ53SWALJaztKCXflY62Go4NmW3jng0sQBIU,2712
|
|
4
|
-
documentcloud/base.py,sha256=z8knHJOQHx1AbeGzCv_3gYL-Byx3_EcsvCf5K_ciiao,7143
|
|
5
|
-
documentcloud/client.py,sha256=Z2d1au1NxjcXxZKpFV990C-ekYRNU-3_d1f02GYCgPc,5974
|
|
6
|
-
documentcloud/constants.py,sha256=wnFesv6UqpV7Qe2r77_T0ymViebBmN8iJWXib_TPeW0,2144
|
|
7
|
-
documentcloud/documents.py,sha256=GihIUThgedoIB-TBemXPsVIP_yT3EMixGuKTKpGJLvk,19567
|
|
8
|
-
documentcloud/exceptions.py,sha256=0YolxmYIXo5ppvVdAwuQDaUclHuSd4NVEmjYALmVO68,1270
|
|
9
|
-
documentcloud/organizations.py,sha256=UQWhqPQBRWMyORpFVPJ2A4iDSSingoaU-EMueg57_R0,565
|
|
10
|
-
documentcloud/projects.py,sha256=gRgWpIZ-NDWTRRKvDg2NcU1EAg7_1qtIINlmQZurncQ,5648
|
|
11
|
-
documentcloud/sections.py,sha256=gET8iNfuXIX8HUaftXtUKQxGwXcjoSo2Ws8kGYTyjh8,1123
|
|
12
|
-
documentcloud/toolbox.py,sha256=Gg4U1rTyogqM6o8x7HNouQiuBYaE4rarzoVbnqNrztw,2012
|
|
13
|
-
documentcloud/users.py,sha256=byjjVmNyvJR8sM0Vs67uX6kQ5xhxFdnrYYDFjPGbBD8,527
|
|
14
|
-
python_documentcloud-3.7.1.dist-info/LICENSE,sha256=Z1IBhHCzIeGR9F2iHtcLt2I2qoUhJ2pK139CAIAuFgo,1151
|
|
15
|
-
python_documentcloud-3.7.1.dist-info/METADATA,sha256=NyAnhN7NrZxDu2DohrkjY2tvuPSyItdcTScJrcYNxes,2751
|
|
16
|
-
python_documentcloud-3.7.1.dist-info/WHEEL,sha256=z9j0xAa_JmUKMpmz72K0ZGALSM_n-wQVmGbleXx2VHg,110
|
|
17
|
-
python_documentcloud-3.7.1.dist-info/top_level.txt,sha256=rzNW2vA9GqU5ipNQYSP1XJQ54ippjKXVIo9oMlM0Tm4,14
|
|
18
|
-
python_documentcloud-3.7.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|