python-documentcloud 3.8.0__tar.gz → 4.0.0__tar.gz
This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- {python-documentcloud-3.8.0/python_documentcloud.egg-info → python-documentcloud-4.0.0}/PKG-INFO +5 -6
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/addon.py +6 -3
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/annotations.py +3 -8
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/base.py +21 -40
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/client.py +8 -13
- python-documentcloud-4.0.0/documentcloud/constants.py +101 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/documents.py +69 -68
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/exceptions.py +2 -4
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/organizations.py +0 -7
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/projects.py +11 -22
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/sections.py +4 -11
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/toolbox.py +4 -9
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/users.py +0 -7
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0/python_documentcloud.egg-info}/PKG-INFO +5 -6
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/setup.py +5 -6
- python-documentcloud-3.8.0/documentcloud/constants.py +0 -104
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/LICENSE +0 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/README.md +0 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/documentcloud/__init__.py +0 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/python_documentcloud.egg-info/SOURCES.txt +0 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/python_documentcloud.egg-info/dependency_links.txt +0 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/python_documentcloud.egg-info/requires.txt +2 -2
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/python_documentcloud.egg-info/top_level.txt +0 -0
- {python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/setup.cfg +0 -0
{python-documentcloud-3.8.0/python_documentcloud.egg-info → python-documentcloud-4.0.0}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: python-documentcloud
|
|
3
|
-
Version: 3.8.0
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: A simple Python wrapper for the DocumentCloud API
|
|
5
5
|
Home-page: https://github.com/muckrock/python-documentcloud
|
|
6
6
|
Author: Mitchell Kotler
|
|
@@ -11,13 +11,12 @@ Classifier: Development Status :: 5 - Production/Stable
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Operating System :: OS Independent
|
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
-
Classifier: Programming Language :: Python
|
|
15
|
-
Classifier: Programming Language :: Python :: 2
|
|
16
|
-
Classifier: Programming Language :: Python :: 2.7
|
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
19
14
|
Classifier: Programming Language :: Python :: 3.7
|
|
20
15
|
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
20
|
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
22
|
Provides-Extra: dev
|
|
@@ -24,7 +24,7 @@ class BaseAddOn:
|
|
|
24
24
|
|
|
25
25
|
def __init__(self):
|
|
26
26
|
args = self._parse_arguments()
|
|
27
|
-
|
|
27
|
+
self._create_client(args)
|
|
28
28
|
|
|
29
29
|
# a unique identifier for this run
|
|
30
30
|
self.id = args.pop("id", None)
|
|
@@ -65,7 +65,7 @@ class BaseAddOn:
|
|
|
65
65
|
self.client.refresh_token = args["refresh_token"]
|
|
66
66
|
if args["token"] is not None:
|
|
67
67
|
self.client.session.headers.update(
|
|
68
|
-
{"Authorization": "Bearer {
|
|
68
|
+
{"Authorization": f"Bearer {args['token']}"}
|
|
69
69
|
)
|
|
70
70
|
|
|
71
71
|
# custom user agent for AddOns
|
|
@@ -119,7 +119,7 @@ class BaseAddOn:
|
|
|
119
119
|
|
|
120
120
|
# validate parameter data
|
|
121
121
|
try:
|
|
122
|
-
with open("config.yaml") as config:
|
|
122
|
+
with open("config.yaml", encoding="utf-8") as config:
|
|
123
123
|
schema = yaml.safe_load(config)
|
|
124
124
|
args["data"] = fastjsonschema.validate(schema, args["data"])
|
|
125
125
|
# add title in case the add-on wants to reference its own title
|
|
@@ -175,6 +175,7 @@ class AddOn(BaseAddOn):
|
|
|
175
175
|
else:
|
|
176
176
|
# text file's buffer is in binary mode
|
|
177
177
|
data = file.buffer
|
|
178
|
+
# pylint: disable=W3101
|
|
178
179
|
response = requests.put(presigned_url, data=data)
|
|
179
180
|
response.raise_for_status()
|
|
180
181
|
return self.client.patch(
|
|
@@ -207,6 +208,8 @@ class AddOn(BaseAddOn):
|
|
|
207
208
|
documents = self.client.documents.search(self.query)
|
|
208
209
|
return documents.count
|
|
209
210
|
|
|
211
|
+
return 0
|
|
212
|
+
|
|
210
213
|
def get_documents(self):
|
|
211
214
|
"""Get documents from either selected or queried documents"""
|
|
212
215
|
if self.documents:
|
|
@@ -1,8 +1,4 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
1
|
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
2
|
from listcrunch.listcrunch import uncrunch
|
|
7
3
|
|
|
8
4
|
# Local
|
|
@@ -10,7 +6,6 @@ from .base import BaseAPIObject, ChildAPIClient
|
|
|
10
6
|
from .toolbox import merge_dicts
|
|
11
7
|
|
|
12
8
|
|
|
13
|
-
@python_2_unicode_compatible
|
|
14
9
|
class Annotation(BaseAPIObject):
|
|
15
10
|
"""A note on a document"""
|
|
16
11
|
|
|
@@ -30,7 +25,7 @@ class Annotation(BaseAPIObject):
|
|
|
30
25
|
|
|
31
26
|
@property
|
|
32
27
|
def api_path(self):
|
|
33
|
-
return "documents/{
|
|
28
|
+
return f"documents/{self.document.id}/notes"
|
|
34
29
|
|
|
35
30
|
@property
|
|
36
31
|
def location(self):
|
|
@@ -71,7 +66,7 @@ class AnnotationClient(ChildAPIClient):
|
|
|
71
66
|
|
|
72
67
|
@property
|
|
73
68
|
def api_path(self):
|
|
74
|
-
return "documents/{
|
|
69
|
+
return f"documents/{self.parent.id}/notes"
|
|
75
70
|
|
|
76
71
|
def create(
|
|
77
72
|
self,
|
|
@@ -102,7 +97,7 @@ class AnnotationClient(ChildAPIClient):
|
|
|
102
97
|
"x2": x2,
|
|
103
98
|
"y2": y2,
|
|
104
99
|
}
|
|
105
|
-
response = self.client.post(self.api_path
|
|
100
|
+
response = self.client.post(f"{self.api_path}/", json=data)
|
|
106
101
|
return Annotation(
|
|
107
102
|
self.client, merge_dicts(response.json(), {"document": self.parent})
|
|
108
103
|
)
|
|
@@ -1,20 +1,14 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
1
|
# Standard Library
|
|
5
|
-
from builtins import str
|
|
6
2
|
from copy import copy
|
|
7
3
|
|
|
8
4
|
# Third Party
|
|
9
5
|
from dateutil.parser import parse as dateparser
|
|
10
|
-
from future.utils import python_2_unicode_compatible
|
|
11
6
|
|
|
12
7
|
# Local
|
|
13
8
|
from .exceptions import DuplicateObjectError
|
|
14
9
|
from .toolbox import get_id, merge_dicts
|
|
15
10
|
|
|
16
11
|
|
|
17
|
-
@python_2_unicode_compatible
|
|
18
12
|
class APIResults(object):
|
|
19
13
|
"""Class for encapsulating paginated list results from the API"""
|
|
20
14
|
|
|
@@ -39,10 +33,10 @@ class APIResults(object):
|
|
|
39
33
|
]
|
|
40
34
|
|
|
41
35
|
def __repr__(self):
|
|
42
|
-
return "<APIResults: {!r}"
|
|
36
|
+
return f"<APIResults: {self.results!r}>" # pragma: no cover
|
|
43
37
|
|
|
44
38
|
def __str__(self):
|
|
45
|
-
return "[{
|
|
39
|
+
return f"[{', '.join(str(r) for r in self.results)}]"
|
|
46
40
|
|
|
47
41
|
def __getitem__(self, key):
|
|
48
42
|
# pylint: disable=unsubscriptable-object
|
|
@@ -104,21 +98,19 @@ class BaseAPIClient(object):
|
|
|
104
98
|
params = {"expand": ",".join(expand)}
|
|
105
99
|
else:
|
|
106
100
|
params = {}
|
|
107
|
-
response = self.client.get(
|
|
108
|
-
"{}/{}/".format(self.api_path, get_id(id_)), params=params
|
|
109
|
-
)
|
|
101
|
+
response = self.client.get(f"{self.api_path}/{get_id(id_)}/", params=params)
|
|
110
102
|
# pylint: disable=not-callable
|
|
111
103
|
return self.resource(self.client, response.json())
|
|
112
104
|
|
|
113
105
|
def delete(self, id_):
|
|
114
106
|
"""Deletes a resource"""
|
|
115
|
-
self.client.delete("{}/{
|
|
107
|
+
self.client.delete(f"{self.api_path}/{get_id(id_)}")
|
|
116
108
|
|
|
117
109
|
def all(self, **params):
|
|
118
110
|
return self.list(**params)
|
|
119
111
|
|
|
120
112
|
def list(self, **params):
|
|
121
|
-
response = self.client.get(self.api_path
|
|
113
|
+
response = self.client.get(f"{self.api_path}/", params=params)
|
|
122
114
|
return APIResults(self.resource, self.client, response)
|
|
123
115
|
|
|
124
116
|
|
|
@@ -126,11 +118,11 @@ class ChildAPIClient(BaseAPIClient):
|
|
|
126
118
|
"""Base client for sub resources"""
|
|
127
119
|
|
|
128
120
|
def __init__(self, client, parent):
|
|
129
|
-
super(
|
|
121
|
+
super().__init__(client)
|
|
130
122
|
self.parent = parent
|
|
131
123
|
|
|
132
124
|
def list(self, **params):
|
|
133
|
-
response = self.client.get(self.api_path
|
|
125
|
+
response = self.client.get(f"{self.api_path}/", params=params)
|
|
134
126
|
parent_name = self.parent.__class__.__name__.lower()
|
|
135
127
|
return APIResults(
|
|
136
128
|
self.resource, self.client, response, {parent_name: self.parent}
|
|
@@ -156,9 +148,7 @@ class BaseAPIObject(object):
|
|
|
156
148
|
setattr(self, field, dateparser(getattr(self, field)))
|
|
157
149
|
|
|
158
150
|
def __repr__(self):
|
|
159
|
-
return "<{}: {} - {}>"
|
|
160
|
-
self.__class__.__name__, self.id, self
|
|
161
|
-
) # pragma: no cover
|
|
151
|
+
return f"<{self.__class__.__name__}: {self.id} - {self}>" # pragma: no cover
|
|
162
152
|
|
|
163
153
|
def __eq__(self, obj):
|
|
164
154
|
return isinstance(obj, type(self)) and self.id == obj.id
|
|
@@ -169,65 +159,56 @@ class BaseAPIObject(object):
|
|
|
169
159
|
|
|
170
160
|
def save(self):
|
|
171
161
|
data = {f: getattr(self, f) for f in self.writable_fields if hasattr(self, f)}
|
|
172
|
-
self._client.put("{}/{}/"
|
|
162
|
+
self._client.put(f"{self.api_path}/{self.id}/", json=data)
|
|
173
163
|
|
|
174
164
|
def delete(self):
|
|
175
|
-
self._client.delete("{}/{
|
|
165
|
+
self._client.delete(f"{self.api_path}/{self.id}")
|
|
176
166
|
|
|
177
167
|
|
|
178
|
-
@python_2_unicode_compatible
|
|
179
168
|
class APISet(list):
|
|
180
169
|
def __init__(self, iterable, resource):
|
|
181
|
-
super(
|
|
170
|
+
super().__init__(iterable)
|
|
182
171
|
self.resource = resource
|
|
183
172
|
if not all(isinstance(obj, self.resource) for obj in self):
|
|
184
173
|
raise TypeError(
|
|
185
|
-
"Only {} can be added to this list"
|
|
186
|
-
self.resource.__class__.__name__
|
|
187
|
-
)
|
|
174
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
188
175
|
)
|
|
189
176
|
ids = [obj.id for obj in self]
|
|
190
177
|
for id_ in ids:
|
|
191
178
|
if ids.count(id_) > 1:
|
|
192
179
|
raise DuplicateObjectError(
|
|
193
|
-
"Object with ID {} appears in the list more than once"
|
|
180
|
+
f"Object with ID {id_} appears in the list more than once"
|
|
194
181
|
)
|
|
195
182
|
|
|
196
183
|
def append(self, obj):
|
|
197
184
|
if not isinstance(obj, self.resource):
|
|
198
185
|
raise TypeError(
|
|
199
|
-
"Only {} can be added to this list"
|
|
200
|
-
self.resource.__class__.__name__
|
|
201
|
-
)
|
|
186
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
202
187
|
)
|
|
203
188
|
if obj.id in [i.id for i in self]:
|
|
204
189
|
raise DuplicateObjectError(
|
|
205
|
-
"Object with ID {} appears in the list more than once"
|
|
190
|
+
f"Object with ID {obj.id} appears in the list more than once"
|
|
206
191
|
)
|
|
207
|
-
super(
|
|
192
|
+
super().append(copy(obj))
|
|
208
193
|
|
|
209
194
|
def add(self, obj):
|
|
210
195
|
if not isinstance(obj, self.resource):
|
|
211
196
|
raise TypeError(
|
|
212
|
-
"Only {} can be added to this list"
|
|
213
|
-
self.resource.__class__.__name__
|
|
214
|
-
)
|
|
197
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
215
198
|
)
|
|
216
199
|
# skip duplicates silently
|
|
217
200
|
if obj.id not in [i.id for i in self]:
|
|
218
|
-
super(
|
|
201
|
+
super().append(copy(obj))
|
|
219
202
|
|
|
220
203
|
def extend(self, list_):
|
|
221
204
|
if not all(isinstance(obj, self.resource) for obj in list_):
|
|
222
205
|
raise TypeError(
|
|
223
|
-
"Only {} can be added to this list"
|
|
224
|
-
self.resource.__class__.__name__
|
|
225
|
-
)
|
|
206
|
+
f"Only {self.resource.__class__.__name__} can be added to this list"
|
|
226
207
|
)
|
|
227
208
|
ids = [obj.id for obj in self + list_]
|
|
228
209
|
for id_ in ids:
|
|
229
210
|
if ids.count(id_) > 1:
|
|
230
211
|
raise DuplicateObjectError(
|
|
231
|
-
"Object with ID {} appears in the list more than once"
|
|
212
|
+
f"Object with ID {id_} appears in the list more than once"
|
|
232
213
|
)
|
|
233
|
-
super(
|
|
214
|
+
super().extend(copy(obj) for obj in list_)
|
|
@@ -2,9 +2,6 @@
|
|
|
2
2
|
The public interface for the DocumentCloud API
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
# Future
|
|
6
|
-
from __future__ import division, print_function, unicode_literals
|
|
7
|
-
|
|
8
5
|
# Standard Library
|
|
9
6
|
import logging
|
|
10
7
|
from functools import partial
|
|
@@ -84,20 +81,18 @@ class DocumentCloud(object):
|
|
|
84
81
|
access_token = None
|
|
85
82
|
|
|
86
83
|
if access_token:
|
|
87
|
-
self.session.headers.update(
|
|
88
|
-
{"Authorization": "Bearer {}".format(access_token)}
|
|
89
|
-
)
|
|
84
|
+
self.session.headers.update({"Authorization": f"Bearer {access_token}"})
|
|
90
85
|
|
|
91
86
|
def _get_tokens(self, username, password):
|
|
92
87
|
"""Get an access and refresh token in exchange for the username and password"""
|
|
93
88
|
response = requests_retry_session().post(
|
|
94
|
-
"{}token/"
|
|
89
|
+
f"{self.auth_uri}token/",
|
|
95
90
|
json={"username": username, "password": password},
|
|
96
91
|
timeout=self.timeout,
|
|
97
92
|
)
|
|
98
93
|
|
|
99
94
|
if response.status_code == requests.codes.UNAUTHORIZED:
|
|
100
|
-
raise CredentialsFailedError("The username and password
|
|
95
|
+
raise CredentialsFailedError("The username and password are incorrect")
|
|
101
96
|
|
|
102
97
|
self.raise_for_status(response)
|
|
103
98
|
|
|
@@ -107,7 +102,7 @@ class DocumentCloud(object):
|
|
|
107
102
|
def _refresh_tokens(self, refresh_token):
|
|
108
103
|
"""Refresh the access and refresh tokens"""
|
|
109
104
|
response = requests_retry_session().post(
|
|
110
|
-
"{}refresh/"
|
|
105
|
+
f"{self.auth_uri}refresh/",
|
|
111
106
|
json={"refresh": refresh_token},
|
|
112
107
|
timeout=self.timeout,
|
|
113
108
|
)
|
|
@@ -136,7 +131,7 @@ class DocumentCloud(object):
|
|
|
136
131
|
full_url = kwargs.pop("full_url", False)
|
|
137
132
|
|
|
138
133
|
if not full_url:
|
|
139
|
-
url = "{}{}"
|
|
134
|
+
url = f"{self.base_uri}{url}"
|
|
140
135
|
|
|
141
136
|
# set the API to version 2.0
|
|
142
137
|
parsed_url = urlparse(url)
|
|
@@ -165,7 +160,7 @@ class DocumentCloud(object):
|
|
|
165
160
|
if attr in methods:
|
|
166
161
|
return partial(self._request, attr)
|
|
167
162
|
raise AttributeError(
|
|
168
|
-
"'{}' object has no attribute '{}'"
|
|
163
|
+
f"'{self.__class__.__name__}' object has no attribute '{attr}'"
|
|
169
164
|
)
|
|
170
165
|
|
|
171
166
|
def raise_for_status(self, response):
|
|
@@ -174,6 +169,6 @@ class DocumentCloud(object):
|
|
|
174
169
|
response.raise_for_status()
|
|
175
170
|
except requests.exceptions.RequestException as exc:
|
|
176
171
|
if exc.response.status_code == 404:
|
|
177
|
-
raise DoesNotExistError(response=exc.response)
|
|
172
|
+
raise DoesNotExistError(response=exc.response) from exc
|
|
178
173
|
else:
|
|
179
|
-
raise APIError(response=exc.response)
|
|
174
|
+
raise APIError(response=exc.response) from exc
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
PER_PAGE_MAX = 100
|
|
2
|
+
BULK_LIMIT = 25
|
|
3
|
+
BASE_URI = "https://api.www.documentcloud.org/api/"
|
|
4
|
+
AUTH_URI = "https://accounts.muckrock.com/api/"
|
|
5
|
+
TIMEOUT = 20
|
|
6
|
+
RATE_LIMIT = 10
|
|
7
|
+
RATE_PERIOD = 1
|
|
8
|
+
SUPPORTED_EXTENSIONS = [
|
|
9
|
+
".abw",
|
|
10
|
+
".zabw",
|
|
11
|
+
".md",
|
|
12
|
+
".pm3",
|
|
13
|
+
".pm4",
|
|
14
|
+
".pm5",
|
|
15
|
+
".pm6",
|
|
16
|
+
".p65",
|
|
17
|
+
".cwk",
|
|
18
|
+
".agd",
|
|
19
|
+
".fhd",
|
|
20
|
+
".kth",
|
|
21
|
+
".key",
|
|
22
|
+
".numbers",
|
|
23
|
+
".pages",
|
|
24
|
+
".bmp",
|
|
25
|
+
".csv",
|
|
26
|
+
".txt",
|
|
27
|
+
".cdr",
|
|
28
|
+
".cmx",
|
|
29
|
+
".cgm",
|
|
30
|
+
".dif",
|
|
31
|
+
".dbf",
|
|
32
|
+
".xml",
|
|
33
|
+
".eps",
|
|
34
|
+
".emf",
|
|
35
|
+
".fb2",
|
|
36
|
+
".gnm",
|
|
37
|
+
".gnumeric",
|
|
38
|
+
".gif",
|
|
39
|
+
".hwp",
|
|
40
|
+
".plt",
|
|
41
|
+
".html",
|
|
42
|
+
".htm",
|
|
43
|
+
".jtd",
|
|
44
|
+
".jtt",
|
|
45
|
+
".jpg",
|
|
46
|
+
".jpeg",
|
|
47
|
+
".wk1",
|
|
48
|
+
".wks",
|
|
49
|
+
".123",
|
|
50
|
+
".wk3",
|
|
51
|
+
".wk4",
|
|
52
|
+
".pct",
|
|
53
|
+
".mml",
|
|
54
|
+
".xls",
|
|
55
|
+
".xlw",
|
|
56
|
+
".xlt",
|
|
57
|
+
".xlsx",
|
|
58
|
+
".docx",
|
|
59
|
+
".pptx",
|
|
60
|
+
".ppt",
|
|
61
|
+
".pps",
|
|
62
|
+
".pot",
|
|
63
|
+
".pptx",
|
|
64
|
+
".pub",
|
|
65
|
+
".rtf",
|
|
66
|
+
".xml",
|
|
67
|
+
".doc",
|
|
68
|
+
".dot",
|
|
69
|
+
".docx",
|
|
70
|
+
".wps",
|
|
71
|
+
".wks",
|
|
72
|
+
".wdb",
|
|
73
|
+
".wri",
|
|
74
|
+
".vsd",
|
|
75
|
+
".pgm",
|
|
76
|
+
".pbm",
|
|
77
|
+
".ppm",
|
|
78
|
+
".odt",
|
|
79
|
+
".fodt",
|
|
80
|
+
".ods",
|
|
81
|
+
".fods",
|
|
82
|
+
".odp",
|
|
83
|
+
".fodp",
|
|
84
|
+
".odg",
|
|
85
|
+
".fodg",
|
|
86
|
+
".odf",
|
|
87
|
+
".odb",
|
|
88
|
+
".sxw",
|
|
89
|
+
".stw",
|
|
90
|
+
".sxc",
|
|
91
|
+
".stc",
|
|
92
|
+
".sxi",
|
|
93
|
+
".sti",
|
|
94
|
+
".sxd",
|
|
95
|
+
".std",
|
|
96
|
+
".sxm",
|
|
97
|
+
".pcx",
|
|
98
|
+
".pcd",
|
|
99
|
+
".psd",
|
|
100
|
+
".pdf",
|
|
101
|
+
]
|
|
@@ -2,19 +2,15 @@
|
|
|
2
2
|
Documents
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
# Future
|
|
6
|
-
from __future__ import division, print_function, unicode_literals
|
|
7
|
-
|
|
8
5
|
# Standard Library
|
|
6
|
+
import datetime
|
|
9
7
|
import logging
|
|
10
8
|
import os
|
|
11
9
|
import re
|
|
12
10
|
import warnings
|
|
13
|
-
import datetime
|
|
14
11
|
from functools import partial
|
|
15
12
|
|
|
16
13
|
# Third Party
|
|
17
|
-
from future.utils import python_2_unicode_compatible
|
|
18
14
|
from requests.exceptions import RequestException
|
|
19
15
|
|
|
20
16
|
# Local
|
|
@@ -32,13 +28,11 @@ try:
|
|
|
32
28
|
except ImportError:
|
|
33
29
|
from urlparse import urlparse
|
|
34
30
|
|
|
35
|
-
|
|
36
31
|
logger = logging.getLogger("documentcloud")
|
|
37
32
|
|
|
38
33
|
IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
|
|
39
34
|
|
|
40
35
|
|
|
41
|
-
@python_2_unicode_compatible
|
|
42
36
|
class Document(BaseAPIObject):
|
|
43
37
|
"""A single DocumentCloud document"""
|
|
44
38
|
|
|
@@ -62,13 +56,13 @@ class Document(BaseAPIObject):
|
|
|
62
56
|
for name, resource in objs:
|
|
63
57
|
value = dict_.get(name)
|
|
64
58
|
if isinstance(value, dict):
|
|
65
|
-
dict_["_"
|
|
66
|
-
dict_[name
|
|
59
|
+
dict_[f"_{name}"] = resource(client, value)
|
|
60
|
+
dict_[f"{name}_id"] = value.get("id")
|
|
67
61
|
elif isinstance(value, int):
|
|
68
|
-
dict_["_"
|
|
69
|
-
dict_[name
|
|
62
|
+
dict_[f"_{name}"] = None
|
|
63
|
+
dict_[f"{name}_id"] = value
|
|
70
64
|
|
|
71
|
-
super(
|
|
65
|
+
super().__init__(client, dict_)
|
|
72
66
|
|
|
73
67
|
self.sections = SectionClient(client, self)
|
|
74
68
|
self.annotations = AnnotationClient(client, self)
|
|
@@ -89,13 +83,13 @@ class Document(BaseAPIObject):
|
|
|
89
83
|
fmt = "json" if json else "text" if text else None
|
|
90
84
|
# this allows dropping `get_` to act like a property, ie
|
|
91
85
|
# .full_text_url
|
|
92
|
-
if not get and hasattr(self, "get_{}"
|
|
93
|
-
return getattr(self, "get_{}"
|
|
86
|
+
if not get and hasattr(self, f"get_{attr}"):
|
|
87
|
+
return getattr(self, f"get_{attr}")()
|
|
94
88
|
# this allows dropping `_url` to fetch the url, ie
|
|
95
89
|
# .get_full_text()
|
|
96
|
-
if not url and hasattr(self, "{}_url"
|
|
90
|
+
if not url and hasattr(self, f"{attr}_url"):
|
|
97
91
|
return lambda *a, **k: self._get_url(
|
|
98
|
-
getattr(self, "{}_url"
|
|
92
|
+
getattr(self, f"{attr}_url")(*a, **k), fmt
|
|
99
93
|
)
|
|
100
94
|
# this genericizes the image sizes
|
|
101
95
|
m_image = p_image.match(attr)
|
|
@@ -104,7 +98,7 @@ class Document(BaseAPIObject):
|
|
|
104
98
|
if m_image and not m_image.group("list"):
|
|
105
99
|
return partial(self.get_image_url, size=m_image.group("size"))
|
|
106
100
|
raise AttributeError(
|
|
107
|
-
"'{}' object has no attribute '{}'"
|
|
101
|
+
f"'{self.__class__.__name__}' object has no attribute '{attr}'"
|
|
108
102
|
)
|
|
109
103
|
|
|
110
104
|
def __dir__(self):
|
|
@@ -115,12 +109,12 @@ class Document(BaseAPIObject):
|
|
|
115
109
|
attrs += [a[len("get_") : -len("_url")] for a in getters if a.endswith("url")]
|
|
116
110
|
for size in IMAGE_SIZES:
|
|
117
111
|
attrs += [
|
|
118
|
-
"get_{}_image_url"
|
|
119
|
-
"{}_image_url"
|
|
120
|
-
"get_{}_image"
|
|
121
|
-
"{}_image"
|
|
122
|
-
"get_{}_image_url_list"
|
|
123
|
-
"{}_image_url_list"
|
|
112
|
+
f"get_{size}_image_url",
|
|
113
|
+
f"{size}_image_url",
|
|
114
|
+
f"get_{size}_image",
|
|
115
|
+
f"{size}_image",
|
|
116
|
+
f"get_{size}_image_url_list",
|
|
117
|
+
f"{size}_image_url_list",
|
|
124
118
|
]
|
|
125
119
|
return sorted(attrs)
|
|
126
120
|
|
|
@@ -187,27 +181,26 @@ class Document(BaseAPIObject):
|
|
|
187
181
|
|
|
188
182
|
# Resource URLs
|
|
189
183
|
def get_full_text_url(self):
|
|
190
|
-
return "{}documents/{}/{}.txt"
|
|
184
|
+
return f"{self.asset_url}documents/{self.id}/{self.slug}.txt"
|
|
191
185
|
|
|
192
186
|
def get_page_text_url(self, page=1):
|
|
193
|
-
return "{}documents/{}/pages/{}-p{}.txt"
|
|
194
|
-
self.asset_url, self.id, self.slug, page
|
|
195
|
-
)
|
|
187
|
+
return f"{self.asset_url}documents/{self.id}/pages/{self.slug}-p{page}.txt"
|
|
196
188
|
|
|
197
189
|
def get_page_position_json_url(self, page=1):
|
|
198
|
-
return
|
|
199
|
-
self.asset_url
|
|
190
|
+
return (
|
|
191
|
+
f"{self.asset_url}documents/{self.id}/pages/"
|
|
192
|
+
f"{self.slug}-p{page}.position.json"
|
|
200
193
|
)
|
|
201
194
|
|
|
202
195
|
def get_json_text_url(self):
|
|
203
|
-
return "{}documents/{}/{}.txt.json"
|
|
196
|
+
return f"{self.asset_url}documents/{self.id}/{self.slug}.txt.json"
|
|
204
197
|
|
|
205
198
|
def get_pdf_url(self):
|
|
206
|
-
return "{}documents/{}/{}.pdf"
|
|
199
|
+
return f"{self.asset_url}documents/{self.id}/{self.slug}.pdf"
|
|
207
200
|
|
|
208
201
|
def get_image_url(self, page=1, size="normal"):
|
|
209
|
-
return
|
|
210
|
-
self.asset_url
|
|
202
|
+
return (
|
|
203
|
+
f"{self.asset_url}documents/{self.id}/pages/{self.slug}-p{page}-{size}.gif"
|
|
211
204
|
)
|
|
212
205
|
|
|
213
206
|
def get_image_url_list(self, size="normal"):
|
|
@@ -217,7 +210,7 @@ class Document(BaseAPIObject):
|
|
|
217
210
|
|
|
218
211
|
def get_errors(self):
|
|
219
212
|
"""Retrieve errors for the document"""
|
|
220
|
-
endpoint = "documents/{}/errors/"
|
|
213
|
+
endpoint = f"documents/{self.id}/errors/"
|
|
221
214
|
all_results = []
|
|
222
215
|
|
|
223
216
|
while endpoint:
|
|
@@ -239,7 +232,7 @@ class Document(BaseAPIObject):
|
|
|
239
232
|
|
|
240
233
|
def process(self):
|
|
241
234
|
"""Reprocess the document"""
|
|
242
|
-
self._client.post("{}/{}/process/"
|
|
235
|
+
self._client.post(f"{self.api_path}/{self.id}/process/")
|
|
243
236
|
|
|
244
237
|
|
|
245
238
|
class DocumentClient(BaseAPIClient):
|
|
@@ -275,6 +268,16 @@ class DocumentClient(BaseAPIClient):
|
|
|
275
268
|
|
|
276
269
|
def upload(self, pdf, **kwargs):
|
|
277
270
|
"""Upload a document"""
|
|
271
|
+
|
|
272
|
+
def check_size(size):
|
|
273
|
+
# DocumentCloud's size limit is set to 501MB to give people a little leeway
|
|
274
|
+
# for OS rounding
|
|
275
|
+
if size >= 501 * 1024 * 1024:
|
|
276
|
+
raise ValueError(
|
|
277
|
+
"The pdf you have submitted is over the DocumentCloud API's 500MB "
|
|
278
|
+
"file size limit. Split it into smaller pieces and try again."
|
|
279
|
+
)
|
|
280
|
+
|
|
278
281
|
# if they pass in a URL, use the URL upload flow
|
|
279
282
|
if is_url(pdf):
|
|
280
283
|
return self._upload_url(pdf, **kwargs)
|
|
@@ -285,19 +288,13 @@ class DocumentClient(BaseAPIClient):
|
|
|
285
288
|
size = os.fstat(pdf.fileno()).st_size
|
|
286
289
|
except (AttributeError, OSError): # pragma: no cover
|
|
287
290
|
size = 0
|
|
291
|
+
check_size(size)
|
|
292
|
+
return self._upload_file(pdf, **kwargs)
|
|
288
293
|
else:
|
|
289
294
|
size = os.path.getsize(pdf)
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
# for OS rounding
|
|
294
|
-
if size >= 501 * 1024 * 1024:
|
|
295
|
-
raise ValueError(
|
|
296
|
-
"The pdf you have submitted is over the DocumentCloud API's 500MB "
|
|
297
|
-
"file size limit. Split it into smaller pieces and try again."
|
|
298
|
-
)
|
|
299
|
-
|
|
300
|
-
return self._upload_file(pdf, **kwargs)
|
|
295
|
+
check_size(size)
|
|
296
|
+
with open(pdf, "rb") as pdf_file:
|
|
297
|
+
return self._upload_file(pdf_file, **kwargs)
|
|
301
298
|
|
|
302
299
|
def _format_upload_parameters(self, name, **kwargs):
|
|
303
300
|
"""Prepare upload parameters from kwargs"""
|
|
@@ -331,9 +328,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
331
328
|
|
|
332
329
|
for param in ignored_parameters:
|
|
333
330
|
if param in kwargs:
|
|
334
|
-
warnings.warn(
|
|
335
|
-
"The parameter `{}` is not currently supported".format(param)
|
|
336
|
-
)
|
|
331
|
+
warnings.warn(f"The parameter `{param}` is not currently supported")
|
|
337
332
|
|
|
338
333
|
return params
|
|
339
334
|
|
|
@@ -363,7 +358,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
363
358
|
# begin processing the document
|
|
364
359
|
doc_id = create_json["id"]
|
|
365
360
|
response = self.client.post(
|
|
366
|
-
"documents/{}/process/"
|
|
361
|
+
f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
|
|
367
362
|
)
|
|
368
363
|
|
|
369
364
|
return Document(self.client, create_json)
|
|
@@ -383,11 +378,13 @@ class DocumentClient(BaseAPIClient):
|
|
|
383
378
|
|
|
384
379
|
def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
|
|
385
380
|
"""Upload files with specified extensions in a directory"""
|
|
381
|
+
# pylint: disable=too-many-locals, too-many-branches
|
|
386
382
|
|
|
387
383
|
# Do not set the same title for all documents
|
|
388
384
|
kwargs.pop("title", None)
|
|
389
385
|
|
|
390
|
-
# If extensions
|
|
386
|
+
# If extensions are specified as None, it will check for all supported
|
|
387
|
+
# filetypes.
|
|
391
388
|
if extensions is None:
|
|
392
389
|
extensions = SUPPORTED_EXTENSIONS
|
|
393
390
|
|
|
@@ -406,7 +403,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
406
403
|
path_list = self._collect_files(path, extensions)
|
|
407
404
|
|
|
408
405
|
logger.info(
|
|
409
|
-
"Upload directory on %s: Found %d files to upload",
|
|
406
|
+
"Upload directory on %s: Found %d files to upload",
|
|
407
|
+
path,
|
|
408
|
+
len(path_list)
|
|
410
409
|
)
|
|
411
410
|
|
|
412
411
|
# Upload all the files using the bulk API to reduce the number
|
|
@@ -417,7 +416,7 @@ class DocumentClient(BaseAPIClient):
|
|
|
417
416
|
# Grouper will put None's on the end of the last group
|
|
418
417
|
file_paths = [p for p in file_paths if p is not None]
|
|
419
418
|
|
|
420
|
-
logger.info("Uploading group %d
|
|
419
|
+
logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
|
|
421
420
|
|
|
422
421
|
# Create the documents
|
|
423
422
|
logger.info("Creating the documents...")
|
|
@@ -442,9 +441,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
442
441
|
except (APIError, RequestException) as exc:
|
|
443
442
|
if handle_errors:
|
|
444
443
|
logger.info(
|
|
445
|
-
"Error creating the following documents: %s
|
|
444
|
+
"Error creating the following documents: %s\n%s",
|
|
446
445
|
exc,
|
|
447
|
-
"\n".join(file_paths)
|
|
446
|
+
"\n".join(file_paths)
|
|
448
447
|
)
|
|
449
448
|
continue
|
|
450
449
|
else:
|
|
@@ -457,16 +456,15 @@ class DocumentClient(BaseAPIClient):
|
|
|
457
456
|
for url, file_path in zip(presigned_urls, file_paths):
|
|
458
457
|
logger.info("Uploading %s to S3...", file_path)
|
|
459
458
|
try:
|
|
460
|
-
|
|
461
|
-
url, data=
|
|
462
|
-
)
|
|
459
|
+
with open(file_path, "rb") as file:
|
|
460
|
+
response = requests_retry_session().put(url, data=file.read())
|
|
463
461
|
self.client.raise_for_status(response)
|
|
464
462
|
except (APIError, RequestException) as exc:
|
|
465
463
|
if handle_errors:
|
|
466
464
|
logger.info(
|
|
467
465
|
"Error uploading the following document: %s %s",
|
|
468
466
|
exc,
|
|
469
|
-
file_path
|
|
467
|
+
file_path
|
|
470
468
|
)
|
|
471
469
|
continue
|
|
472
470
|
else:
|
|
@@ -480,9 +478,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
480
478
|
except (APIError, RequestException) as exc:
|
|
481
479
|
if handle_errors:
|
|
482
480
|
logger.info(
|
|
483
|
-
"Error creating the following documents: %s
|
|
481
|
+
"Error creating the following documents: %s\n%s",
|
|
484
482
|
exc,
|
|
485
|
-
"\n".join(file_paths)
|
|
483
|
+
"\n".join(file_paths)
|
|
486
484
|
)
|
|
487
485
|
continue
|
|
488
486
|
else:
|
|
@@ -505,7 +503,11 @@ class DocumentClient(BaseAPIClient):
|
|
|
505
503
|
# Grouper will put None's on the end of the last group
|
|
506
504
|
url_group = [url for url in url_group if url is not None]
|
|
507
505
|
|
|
508
|
-
logger.info(
|
|
506
|
+
logger.info(
|
|
507
|
+
"Uploading group %d: %s",
|
|
508
|
+
i + 1,
|
|
509
|
+
"\n".join(url_group)
|
|
510
|
+
)
|
|
509
511
|
|
|
510
512
|
# Create the documents
|
|
511
513
|
logger.info("Creating the documents...")
|
|
@@ -526,9 +528,9 @@ class DocumentClient(BaseAPIClient):
|
|
|
526
528
|
except (APIError, RequestException) as exc:
|
|
527
529
|
if handle_errors:
|
|
528
530
|
logger.info(
|
|
529
|
-
"Error creating the following documents: %s
|
|
530
|
-
exc,
|
|
531
|
-
"\n".join(url_group)
|
|
531
|
+
"Error creating the following documents: %s\n%s",
|
|
532
|
+
str(exc),
|
|
533
|
+
"\n".join(url_group)
|
|
532
534
|
)
|
|
533
535
|
continue
|
|
534
536
|
else:
|
|
@@ -543,7 +545,6 @@ class DocumentClient(BaseAPIClient):
|
|
|
543
545
|
return [Document(self.client, d) for d in obj_list]
|
|
544
546
|
|
|
545
547
|
|
|
546
|
-
@python_2_unicode_compatible
|
|
547
548
|
class Mention:
|
|
548
549
|
"""A snippet from a document search"""
|
|
549
550
|
|
|
@@ -554,7 +555,7 @@ class Mention:
|
|
|
554
555
|
self.text = text
|
|
555
556
|
|
|
556
557
|
def __repr__(self):
|
|
557
|
-
return "<{}: {}>"
|
|
558
|
+
return f"<{self.__class__.__name__}: {self}>" # pragma: no cover
|
|
558
559
|
|
|
559
560
|
def __str__(self):
|
|
560
|
-
return '{} - "{}"'
|
|
561
|
+
return f'{self.page} - "{self.text}"'
|
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Custom exceptions for python-documentcloud
|
|
3
3
|
"""
|
|
4
|
-
# Future
|
|
5
|
-
from __future__ import division, print_function, unicode_literals
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
class DocumentCloudError(Exception):
|
|
@@ -14,11 +12,11 @@ class DocumentCloudError(Exception):
|
|
|
14
12
|
self.error = self.response.text
|
|
15
13
|
self.status_code = self.response.status_code
|
|
16
14
|
if not args:
|
|
17
|
-
args = ["{} - {
|
|
15
|
+
args = [f"{self.status_code} - {self.error}"]
|
|
18
16
|
else:
|
|
19
17
|
self.error = None
|
|
20
18
|
self.status_code = None
|
|
21
|
-
super(
|
|
19
|
+
super().__init__(*args, **kwargs)
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
class DuplicateObjectError(DocumentCloudError):
|
|
@@ -1,14 +1,7 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import BaseAPIClient, BaseAPIObject
|
|
9
3
|
|
|
10
4
|
|
|
11
|
-
@python_2_unicode_compatible
|
|
12
5
|
class Organization(BaseAPIObject):
|
|
13
6
|
"""A documentcloud organization"""
|
|
14
7
|
|
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import APISet, BaseAPIClient, BaseAPIObject
|
|
9
3
|
from .constants import BULK_LIMIT, PER_PAGE_MAX
|
|
@@ -12,7 +6,6 @@ from .exceptions import DoesNotExistError, MultipleObjectsReturnedError
|
|
|
12
6
|
from .toolbox import get_id, grouper
|
|
13
7
|
|
|
14
8
|
|
|
15
|
-
@python_2_unicode_compatible
|
|
16
9
|
class Project(BaseAPIObject):
|
|
17
10
|
"""A documentcloud project"""
|
|
18
11
|
|
|
@@ -21,7 +14,7 @@ class Project(BaseAPIObject):
|
|
|
21
14
|
|
|
22
15
|
def __init__(self, *args, **kwargs):
|
|
23
16
|
per_page = kwargs.pop("per_page", PER_PAGE_MAX)
|
|
24
|
-
super(
|
|
17
|
+
super().__init__(*args, **kwargs)
|
|
25
18
|
self._document_list = None
|
|
26
19
|
self._per_page = per_page
|
|
27
20
|
|
|
@@ -30,7 +23,7 @@ class Project(BaseAPIObject):
|
|
|
30
23
|
|
|
31
24
|
def save(self):
|
|
32
25
|
"""Add the documents to the project as well"""
|
|
33
|
-
super(
|
|
26
|
+
super().save()
|
|
34
27
|
if self._document_list:
|
|
35
28
|
self.clear_documents()
|
|
36
29
|
self.add_documents(self._document_list)
|
|
@@ -39,7 +32,7 @@ class Project(BaseAPIObject):
|
|
|
39
32
|
def document_list(self):
|
|
40
33
|
if self._document_list is None:
|
|
41
34
|
response = self._client.get(
|
|
42
|
-
"{}/{
|
|
35
|
+
f"{self.api_path}/{get_id(self.id)}/documents/",
|
|
43
36
|
params={"per_page": self._per_page, "expand": ["document"]},
|
|
44
37
|
)
|
|
45
38
|
json = response.json()
|
|
@@ -78,16 +71,14 @@ class Project(BaseAPIObject):
|
|
|
78
71
|
|
|
79
72
|
def get_document(self, doc_id):
|
|
80
73
|
response = self._client.get(
|
|
81
|
-
"{}/{}/documents/{}"
|
|
74
|
+
f"{self.api_path}/{get_id(self.id)}/documents/{doc_id}",
|
|
82
75
|
params={"expand": ["document"]},
|
|
83
76
|
)
|
|
84
77
|
return Document(self._client, response.json()["document"])
|
|
85
78
|
|
|
86
79
|
def clear_documents(self):
|
|
87
80
|
"""Remove all documents from this project"""
|
|
88
|
-
self._client.put(
|
|
89
|
-
"{}/{}/documents/".format(self.api_path, self.id), json=[]
|
|
90
|
-
)
|
|
81
|
+
self._client.put(f"{self.api_path}/{self.id}/documents/", json=[])
|
|
91
82
|
|
|
92
83
|
def add_documents(self, documents):
|
|
93
84
|
"""Efficient way to bulk add documents to a project"""
|
|
@@ -95,9 +86,7 @@ class Project(BaseAPIObject):
|
|
|
95
86
|
for data_group in grouper(data, BULK_LIMIT):
|
|
96
87
|
# Grouper will put None's on the end of the last group
|
|
97
88
|
data_group = [d for d in data_group if d is not None]
|
|
98
|
-
self._client.patch(
|
|
99
|
-
"{}/{}/documents/".format(self.api_path, self.id), json=data_group
|
|
100
|
-
)
|
|
89
|
+
self._client.patch(f"{self.api_path}/{self.id}/documents/", json=data_group)
|
|
101
90
|
|
|
102
91
|
|
|
103
92
|
class ProjectClient(BaseAPIClient):
|
|
@@ -106,12 +95,12 @@ class ProjectClient(BaseAPIClient):
|
|
|
106
95
|
api_path = "projects"
|
|
107
96
|
resource = Project
|
|
108
97
|
|
|
109
|
-
# all is
|
|
98
|
+
# all is overridden to filter by the current user for backward compatibility
|
|
110
99
|
def all(self, **params):
|
|
111
100
|
return self.list(user=self.client.user_id, **params)
|
|
112
101
|
|
|
113
102
|
def get(self, id=None, title=None):
|
|
114
|
-
# pylint:disable=redefined-builtin, arguments-
|
|
103
|
+
# pylint:disable=redefined-builtin, arguments-renamed
|
|
115
104
|
# pylint disables are necessary for backward compatibility
|
|
116
105
|
if id is not None and title is not None:
|
|
117
106
|
raise ValueError(
|
|
@@ -126,11 +115,11 @@ class ProjectClient(BaseAPIClient):
|
|
|
126
115
|
return self.get_by_title(title)
|
|
127
116
|
|
|
128
117
|
def get_by_id(self, id_):
|
|
129
|
-
return super(
|
|
118
|
+
return super().get(id_)
|
|
130
119
|
|
|
131
120
|
def get_by_title(self, title):
|
|
132
121
|
response = self.client.get(
|
|
133
|
-
self.api_path
|
|
122
|
+
f"{self.api_path}/", params={"title": title, "user": self.client.user_id}
|
|
134
123
|
)
|
|
135
124
|
json = response.json()
|
|
136
125
|
count = len(json["results"])
|
|
@@ -148,7 +137,7 @@ class ProjectClient(BaseAPIClient):
|
|
|
148
137
|
if document_ids:
|
|
149
138
|
data = [{"document": d} for d in document_ids]
|
|
150
139
|
response = self.client.put(
|
|
151
|
-
"{}/{}/documents/"
|
|
140
|
+
f"{self.api_path}/{project.id}/documents/", json=data
|
|
152
141
|
)
|
|
153
142
|
return project
|
|
154
143
|
|
|
@@ -1,26 +1,19 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import BaseAPIObject, ChildAPIClient
|
|
9
3
|
from .toolbox import merge_dicts
|
|
10
4
|
|
|
11
5
|
|
|
12
|
-
@python_2_unicode_compatible
|
|
13
6
|
class Section(BaseAPIObject):
|
|
14
7
|
"""A section of a document"""
|
|
15
8
|
|
|
16
9
|
writable_fields = ["page_number", "title"]
|
|
17
10
|
|
|
18
11
|
def __str__(self):
|
|
19
|
-
return "{} - p{
|
|
12
|
+
return f"{self.title} - p{self.page}"
|
|
20
13
|
|
|
21
14
|
@property
|
|
22
15
|
def api_path(self):
|
|
23
|
-
return "documents/{
|
|
16
|
+
return f"documents/{self.document.id}/sections"
|
|
24
17
|
|
|
25
18
|
@property
|
|
26
19
|
def page(self):
|
|
@@ -34,11 +27,11 @@ class SectionClient(ChildAPIClient):
|
|
|
34
27
|
|
|
35
28
|
@property
|
|
36
29
|
def api_path(self):
|
|
37
|
-
return "documents/{
|
|
30
|
+
return f"documents/{self.parent.id}/sections"
|
|
38
31
|
|
|
39
32
|
def create(self, title, page_number):
|
|
40
33
|
data = {"title": title, "page_number": page_number}
|
|
41
|
-
response = self.client.post(self.api_path
|
|
34
|
+
response = self.client.post(f"{self.api_path}/", json=data)
|
|
42
35
|
return Section(
|
|
43
36
|
self.client, merge_dicts(response.json(), {"document": self.parent})
|
|
44
37
|
)
|
|
@@ -1,21 +1,16 @@
|
|
|
1
1
|
"""
|
|
2
2
|
A few toys the API will use.
|
|
3
3
|
"""
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
|
|
5
|
+
# Standard Library
|
|
6
|
+
from itertools import zip_longest
|
|
7
|
+
from urllib.parse import urlparse
|
|
6
8
|
|
|
7
9
|
# Third Party
|
|
8
10
|
import requests
|
|
9
11
|
from requests.adapters import HTTPAdapter
|
|
10
12
|
from urllib3.util.retry import Retry
|
|
11
13
|
|
|
12
|
-
try:
|
|
13
|
-
from urllib.parse import urlparse
|
|
14
|
-
from itertools import zip_longest
|
|
15
|
-
except ImportError:
|
|
16
|
-
from urlparse import urlparse
|
|
17
|
-
from itertools import izip_longest as zip_longest
|
|
18
|
-
|
|
19
14
|
|
|
20
15
|
def requests_retry_session(
|
|
21
16
|
retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None
|
|
@@ -1,14 +1,7 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
# Third Party
|
|
5
|
-
from future.utils import python_2_unicode_compatible
|
|
6
|
-
|
|
7
1
|
# Local
|
|
8
2
|
from .base import BaseAPIClient, BaseAPIObject
|
|
9
3
|
|
|
10
4
|
|
|
11
|
-
@python_2_unicode_compatible
|
|
12
5
|
class User(BaseAPIObject):
|
|
13
6
|
"""A documentcloud user"""
|
|
14
7
|
|
{python-documentcloud-3.8.0 → python-documentcloud-4.0.0/python_documentcloud.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: python-documentcloud
|
|
3
|
-
Version:
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: A simple Python wrapper for the DocumentCloud API
|
|
5
5
|
Home-page: https://github.com/muckrock/python-documentcloud
|
|
6
6
|
Author: Mitchell Kotler
|
|
@@ -11,13 +11,12 @@ Classifier: Development Status :: 5 - Production/Stable
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Operating System :: OS Independent
|
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
-
Classifier: Programming Language :: Python
|
|
15
|
-
Classifier: Programming Language :: Python :: 2
|
|
16
|
-
Classifier: Programming Language :: Python :: 2.7
|
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
19
14
|
Classifier: Programming Language :: Python :: 3.7
|
|
20
15
|
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
20
|
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
22
|
Provides-Extra: dev
|
|
@@ -7,7 +7,7 @@ with open("README.md", "r") as fh:
|
|
|
7
7
|
|
|
8
8
|
setup(
|
|
9
9
|
name="python-documentcloud",
|
|
10
|
-
version="
|
|
10
|
+
version="4.0.0",
|
|
11
11
|
description="A simple Python wrapper for the DocumentCloud API",
|
|
12
12
|
author="Mitchell Kotler",
|
|
13
13
|
author_email="mitch@muckrock.com",
|
|
@@ -48,13 +48,12 @@ setup(
|
|
|
48
48
|
"Intended Audience :: Developers",
|
|
49
49
|
"Operating System :: OS Independent",
|
|
50
50
|
"License :: OSI Approved :: MIT License",
|
|
51
|
-
"Programming Language :: Python",
|
|
52
|
-
"Programming Language :: Python :: 2",
|
|
53
|
-
"Programming Language :: Python :: 2.7",
|
|
54
|
-
"Programming Language :: Python :: 3",
|
|
55
|
-
"Programming Language :: Python :: 3.6",
|
|
56
51
|
"Programming Language :: Python :: 3.7",
|
|
57
52
|
"Programming Language :: Python :: 3.8",
|
|
53
|
+
"Programming Language :: Python :: 3.9",
|
|
54
|
+
"Programming Language :: Python :: 3.10",
|
|
55
|
+
"Programming Language :: Python :: 3.11",
|
|
56
|
+
"Programming Language :: Python :: 3.12",
|
|
58
57
|
"Topic :: Internet :: WWW/HTTP",
|
|
59
58
|
),
|
|
60
59
|
)
|
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
# Future
|
|
2
|
-
from __future__ import division, print_function, unicode_literals
|
|
3
|
-
|
|
4
|
-
PER_PAGE_MAX = 100
|
|
5
|
-
BULK_LIMIT = 25
|
|
6
|
-
BASE_URI = "https://api.www.documentcloud.org/api/"
|
|
7
|
-
AUTH_URI = "https://accounts.muckrock.com/api/"
|
|
8
|
-
TIMEOUT = 20
|
|
9
|
-
RATE_LIMIT = 10
|
|
10
|
-
RATE_PERIOD = 1
|
|
11
|
-
SUPPORTED_EXTENSIONS = [
|
|
12
|
-
".abw",
|
|
13
|
-
".zabw",
|
|
14
|
-
".md",
|
|
15
|
-
".pm3",
|
|
16
|
-
".pm4",
|
|
17
|
-
".pm5",
|
|
18
|
-
".pm6",
|
|
19
|
-
".p65",
|
|
20
|
-
".cwk",
|
|
21
|
-
".agd",
|
|
22
|
-
".fhd",
|
|
23
|
-
".kth",
|
|
24
|
-
".key",
|
|
25
|
-
".numbers",
|
|
26
|
-
".pages",
|
|
27
|
-
".bmp",
|
|
28
|
-
".csv",
|
|
29
|
-
".txt",
|
|
30
|
-
".cdr",
|
|
31
|
-
".cmx",
|
|
32
|
-
".cgm",
|
|
33
|
-
".dif",
|
|
34
|
-
".dbf",
|
|
35
|
-
".xml",
|
|
36
|
-
".eps",
|
|
37
|
-
".emf",
|
|
38
|
-
".fb2",
|
|
39
|
-
".gnm",
|
|
40
|
-
".gnumeric",
|
|
41
|
-
".gif",
|
|
42
|
-
".hwp",
|
|
43
|
-
".plt",
|
|
44
|
-
".html",
|
|
45
|
-
".htm",
|
|
46
|
-
".jtd",
|
|
47
|
-
".jtt",
|
|
48
|
-
".jpg",
|
|
49
|
-
".jpeg",
|
|
50
|
-
".wk1",
|
|
51
|
-
".wks",
|
|
52
|
-
".123",
|
|
53
|
-
".wk3",
|
|
54
|
-
".wk4",
|
|
55
|
-
".pct",
|
|
56
|
-
".mml",
|
|
57
|
-
".xls",
|
|
58
|
-
".xlw",
|
|
59
|
-
".xlt",
|
|
60
|
-
".xlsx",
|
|
61
|
-
".docx",
|
|
62
|
-
".pptx",
|
|
63
|
-
".ppt",
|
|
64
|
-
".pps",
|
|
65
|
-
".pot",
|
|
66
|
-
".pptx",
|
|
67
|
-
".pub",
|
|
68
|
-
".rtf",
|
|
69
|
-
".xml",
|
|
70
|
-
".doc",
|
|
71
|
-
".dot",
|
|
72
|
-
".docx",
|
|
73
|
-
".wps",
|
|
74
|
-
".wks",
|
|
75
|
-
".wdb",
|
|
76
|
-
".wri",
|
|
77
|
-
".vsd",
|
|
78
|
-
".pgm",
|
|
79
|
-
".pbm",
|
|
80
|
-
".ppm",
|
|
81
|
-
".odt",
|
|
82
|
-
".fodt",
|
|
83
|
-
".ods",
|
|
84
|
-
".fods",
|
|
85
|
-
".odp",
|
|
86
|
-
".fodp",
|
|
87
|
-
".odg",
|
|
88
|
-
".fodg",
|
|
89
|
-
".odf",
|
|
90
|
-
".odb",
|
|
91
|
-
".sxw",
|
|
92
|
-
".stw",
|
|
93
|
-
".sxc",
|
|
94
|
-
".stc",
|
|
95
|
-
".sxi",
|
|
96
|
-
".sti",
|
|
97
|
-
".sxd",
|
|
98
|
-
".std",
|
|
99
|
-
".sxm",
|
|
100
|
-
".pcx",
|
|
101
|
-
".pcd",
|
|
102
|
-
".psd",
|
|
103
|
-
".pdf",
|
|
104
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python-documentcloud-3.8.0 → python-documentcloud-4.0.0}/python_documentcloud.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|