datamaestro 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/context.py +18 -9
- datamaestro/data/ml.py +17 -9
- datamaestro/definitions.py +58 -18
- datamaestro/download/__init__.py +31 -2
- datamaestro/download/wayback.py +163 -0
- datamaestro/record.py +14 -1
- datamaestro/version.py +2 -2
- {datamaestro-1.1.0.dist-info → datamaestro-1.2.1.dist-info}/METADATA +1 -1
- {datamaestro-1.1.0.dist-info → datamaestro-1.2.1.dist-info}/RECORD +14 -13
- {datamaestro-1.1.0.dist-info → datamaestro-1.2.1.dist-info}/WHEEL +1 -1
- {datamaestro-1.1.0.dist-info → datamaestro-1.2.1.dist-info}/LICENSE +0 -0
- {datamaestro-1.1.0.dist-info → datamaestro-1.2.1.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.1.0.dist-info → datamaestro-1.2.1.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,15 @@
|
|
|
1
|
-
import
|
|
2
|
-
from datamaestro.definitions import
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from datamaestro.definitions import AbstractDataset, hook
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
@hook("pre-use")
|
|
6
|
-
def useragreement(definition: AbstractDataset, message, id=None):
|
|
6
|
+
def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
|
|
7
|
+
"""Asks for a user-agreement
|
|
8
|
+
|
|
9
|
+
:param definition: The dataset for which the agreement is asked
|
|
10
|
+
:param message: The agreement text
|
|
11
|
+
:param id: The ID of the agreement (default to the dataset ID)
|
|
12
|
+
"""
|
|
7
13
|
# Skip agreement when testing
|
|
8
14
|
if definition.context.running_test:
|
|
9
15
|
return
|
datamaestro/context.py
CHANGED
|
@@ -110,19 +110,20 @@ class Context:
|
|
|
110
110
|
if repositoryid is None:
|
|
111
111
|
return None
|
|
112
112
|
|
|
113
|
-
|
|
113
|
+
entry_points = [
|
|
114
114
|
x
|
|
115
115
|
for x in pkg_resources.iter_entry_points(
|
|
116
116
|
"datamaestro.repositories", repositoryid
|
|
117
117
|
)
|
|
118
118
|
]
|
|
119
|
-
if not
|
|
119
|
+
if not entry_points:
|
|
120
120
|
raise Exception("No datasets repository named %s", repositoryid)
|
|
121
|
-
if len(
|
|
121
|
+
if len(entry_points) > 1:
|
|
122
122
|
raise Exception(
|
|
123
|
-
"Too many datasets repository named %s (%d)"
|
|
123
|
+
"Too many datasets repository named %s (%d)"
|
|
124
|
+
% (repositoryid, len(entry_points))
|
|
124
125
|
)
|
|
125
|
-
return
|
|
126
|
+
return entry_points[0].load()(self)
|
|
126
127
|
|
|
127
128
|
@property
|
|
128
129
|
def running_test(self):
|
|
@@ -175,7 +176,6 @@ class Context:
|
|
|
175
176
|
if dlpath.is_file():
|
|
176
177
|
logging.debug("Using cached file %s for %s", dlpath, url)
|
|
177
178
|
else:
|
|
178
|
-
|
|
179
179
|
logging.info("Downloading %s", url)
|
|
180
180
|
tmppath = dlpath.with_suffix(".tmp")
|
|
181
181
|
|
|
@@ -188,7 +188,7 @@ class Context:
|
|
|
188
188
|
|
|
189
189
|
def ask(self, question: str, options: Dict[str, str]):
|
|
190
190
|
"""Ask a question to the user"""
|
|
191
|
-
print(question)
|
|
191
|
+
print(question) # noqa: T201
|
|
192
192
|
answer = None
|
|
193
193
|
while answer not in options:
|
|
194
194
|
answer = input().strip().lower()
|
|
@@ -268,6 +268,7 @@ class Datasets(Iterable["AbstractDataset"]):
|
|
|
268
268
|
|
|
269
269
|
def __iter__(self) -> Iterable["AbstractDataset"]:
|
|
270
270
|
from .definitions import DatasetWrapper
|
|
271
|
+
from datamaestro.data import Base
|
|
271
272
|
|
|
272
273
|
# Iterates over defined symbols
|
|
273
274
|
for key, value in self.module.__dict__.items():
|
|
@@ -276,10 +277,18 @@ class Datasets(Iterable["AbstractDataset"]):
|
|
|
276
277
|
# Ensure it comes from the module
|
|
277
278
|
if self.module.__name__ == value.t.__module__:
|
|
278
279
|
yield value
|
|
280
|
+
elif (
|
|
281
|
+
inspect.isclass(value)
|
|
282
|
+
and issubclass(value, Base)
|
|
283
|
+
and hasattr(value, "__dataset__")
|
|
284
|
+
):
|
|
285
|
+
if self.module.__name__ == value.__module__:
|
|
286
|
+
yield value.__dataset__
|
|
279
287
|
|
|
280
288
|
|
|
281
289
|
class Repository:
|
|
282
|
-
"""A repository regroup a set of datasets and their corresponding specific
|
|
290
|
+
"""A repository regroup a set of datasets and their corresponding specific
|
|
291
|
+
handlers (downloading, filtering, etc.)"""
|
|
283
292
|
|
|
284
293
|
def __init__(self, context: Context):
|
|
285
294
|
"""Initialize a new repository
|
|
@@ -315,7 +324,7 @@ class Repository:
|
|
|
315
324
|
try:
|
|
316
325
|
return get_distribution(cls.__module__).version
|
|
317
326
|
except DistributionNotFound:
|
|
318
|
-
|
|
327
|
+
return None
|
|
319
328
|
|
|
320
329
|
def __repr__(self):
|
|
321
330
|
return "Repository(%s)" % self.basedir
|
datamaestro/data/ml.py
CHANGED
|
@@ -1,19 +1,27 @@
|
|
|
1
1
|
"""Machine learning generic data formats"""
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Generic, TypeVar, Optional
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from
|
|
4
|
+
from experimaestro import Param, Meta, argument
|
|
5
|
+
from . import Base
|
|
5
6
|
|
|
7
|
+
Train = TypeVar("Train", bound=Base)
|
|
8
|
+
Validation = TypeVar("Validation", bound=Base)
|
|
9
|
+
Test = TypeVar("Test", bound=Base)
|
|
6
10
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
class Supervised(Base, Generic[Train, Validation, Test]):
    """Dataset grouping a training split with optional validation/test splits"""

    train: Param[Base]
    """The training dataset"""

    validation: Param[Optional[Base]] = None
    """The validation dataset (optional)"""

    test: Param[Optional[Base]] = None
    """The test dataset (optional)"""
|
|
12
21
|
|
|
13
22
|
|
|
14
|
-
@argument("path", type=Path)
|
|
15
23
|
@argument("classes")
|
|
16
24
|
class FolderBased(Base):
|
|
17
25
|
"""Classification dataset where folders give the basis"""
|
|
18
26
|
|
|
19
|
-
|
|
27
|
+
path: Meta[Path]
|
datamaestro/definitions.py
CHANGED
|
@@ -127,6 +127,13 @@ class AbstractDataset(AbstractData):
|
|
|
127
127
|
"""
|
|
128
128
|
|
|
129
129
|
name: Optional[str] = None
|
|
130
|
+
"""The name of the dataset"""
|
|
131
|
+
|
|
132
|
+
url: Optional[str] = None
|
|
133
|
+
"""The URL of the dataset"""
|
|
134
|
+
|
|
135
|
+
doi: Optional[str] = None
|
|
136
|
+
"""The DOI of this dataset"""
|
|
130
137
|
|
|
131
138
|
def __init__(self, repository: Optional["Repository"]):
|
|
132
139
|
super().__init__()
|
|
@@ -136,6 +143,7 @@ class AbstractDataset(AbstractData):
|
|
|
136
143
|
|
|
137
144
|
# Associated resources
|
|
138
145
|
self.resources: Dict[str, "Download"] = {}
|
|
146
|
+
self.ordered_resources = []
|
|
139
147
|
|
|
140
148
|
# Hooks
|
|
141
149
|
# pre-use: before returning the dataset object
|
|
@@ -194,13 +202,15 @@ class AbstractDataset(AbstractData):
|
|
|
194
202
|
def download(self, force=False):
    """Download the resources listed in ``self.ordered_resources``, in order.

    Processing stops at the first resource whose ``download`` raises: the
    error is logged, the traceback is printed, and the remaining resources
    are left untouched.

    :param force: passed unchanged to each resource's ``download``
    :return: ``True`` when no resource raised, ``False`` otherwise
    """
    pending = self.ordered_resources
    logging.info("Materializing %d resources", len(pending))

    for current in pending:
        try:
            current.download(force)
        except Exception:
            logging.error("Could not download resource %s", current)
            traceback.print_exc()
            # First failure aborts the whole run
            return False

    return True
|
|
205
215
|
|
|
206
216
|
@staticmethod
|
|
@@ -249,6 +259,7 @@ class DatasetWrapper(AbstractDataset):
|
|
|
249
259
|
def __init__(self, annotation, t: type):
|
|
250
260
|
self.t = t
|
|
251
261
|
self.base = annotation.base
|
|
262
|
+
self.config = None
|
|
252
263
|
assert self.base is not None, f"Could not set the Config type for {t}"
|
|
253
264
|
|
|
254
265
|
repository, components = DataDefinition.repository_relpath(t)
|
|
@@ -256,6 +267,7 @@ class DatasetWrapper(AbstractDataset):
|
|
|
256
267
|
|
|
257
268
|
# Set some variables
|
|
258
269
|
self.url = annotation.url
|
|
270
|
+
self.doi = annotation.doi
|
|
259
271
|
|
|
260
272
|
# Builds the ID:
|
|
261
273
|
# Removes module_name.config prefix
|
|
@@ -322,7 +334,18 @@ class DatasetWrapper(AbstractDataset):
|
|
|
322
334
|
"""Returns a pointer to a potential attribute"""
|
|
323
335
|
return FutureAttr(self, [key])
|
|
324
336
|
|
|
337
|
+
def download(self, force=False):
    """Download the resources of this dataset.

    When the wrapped object ``self.t`` is itself the base type
    (``self.base is self.t``), ``_prepare()`` is called first; the actual
    download is then delegated to the parent implementation.

    :param force: forwarded to the parent ``download``
    :return: the value returned by the parent ``download``
    """
    wraps_base_type = self.base is self.t
    if wraps_base_type:
        self._prepare()

    return super().download(force=force)
|
|
341
|
+
|
|
325
342
|
def _prepare(self, download=False) -> "Base":
|
|
343
|
+
if self.config is not None:
|
|
344
|
+
return self.config
|
|
345
|
+
|
|
346
|
+
if self.base is self.t:
|
|
347
|
+
self.config = self.base.__create_dataset__(self)
|
|
348
|
+
|
|
326
349
|
if download:
|
|
327
350
|
for hook in self.hooks["pre-download"]:
|
|
328
351
|
hook(self)
|
|
@@ -332,23 +355,23 @@ class DatasetWrapper(AbstractDataset):
|
|
|
332
355
|
for hook in self.hooks["pre-use"]:
|
|
333
356
|
hook(self)
|
|
334
357
|
|
|
335
|
-
resources = {key: value.prepare() for key, value in self.resources.items()}
|
|
336
|
-
dict = self.t(**resources)
|
|
337
|
-
if dict is None:
|
|
338
|
-
name = self.t.__name__
|
|
339
|
-
filename = inspect.getfile(self.t)
|
|
340
|
-
raise Exception(
|
|
341
|
-
f"The dataset method {name} defined in "
|
|
342
|
-
f"{filename} returned a null object"
|
|
343
|
-
)
|
|
344
|
-
|
|
345
358
|
# Construct the object
|
|
346
|
-
|
|
359
|
+
if self.config is None:
|
|
360
|
+
resources = {key: value.prepare() for key, value in self.resources.items()}
|
|
361
|
+
dict = self.t(**resources)
|
|
362
|
+
if dict is None:
|
|
363
|
+
name = self.t.__name__
|
|
364
|
+
filename = inspect.getfile(self.t)
|
|
365
|
+
raise Exception(
|
|
366
|
+
f"The dataset method {name} defined in "
|
|
367
|
+
f"{filename} returned a null object"
|
|
368
|
+
)
|
|
369
|
+
self.config = self.base(**dict)
|
|
347
370
|
|
|
348
371
|
# Set the ids
|
|
349
|
-
self.setDataIDs(
|
|
372
|
+
self.setDataIDs(self.config, self.id)
|
|
350
373
|
|
|
351
|
-
return
|
|
374
|
+
return self.config
|
|
352
375
|
|
|
353
376
|
@property
|
|
354
377
|
def _path(self) -> Path:
|
|
@@ -455,7 +478,9 @@ datatasks = DataTagging(lambda d: d.tasks)
|
|
|
455
478
|
|
|
456
479
|
|
|
457
480
|
class dataset:
|
|
458
|
-
def __init__(
|
|
481
|
+
def __init__(
|
|
482
|
+
self, base=None, *, timestamp=None, id=None, url=None, size=None, doi=None
|
|
483
|
+
):
|
|
459
484
|
"""Creates a new (meta)dataset
|
|
460
485
|
|
|
461
486
|
Meta-datasets are not associated with any base type
|
|
@@ -473,6 +498,8 @@ class dataset:
|
|
|
473
498
|
url {[type]} -- [description] (default: {None})
|
|
474
499
|
|
|
475
500
|
size {str} -- The size (should be a parsable format)
|
|
501
|
+
|
|
502
|
+
doi {str} -- The DOI of the corresponding paper
|
|
476
503
|
"""
|
|
477
504
|
if hasattr(base, "__datamaestro__") and isinstance(
|
|
478
505
|
base.__datamaestro__, metadataset
|
|
@@ -486,18 +513,31 @@ class dataset:
|
|
|
486
513
|
self.meta = False
|
|
487
514
|
self.timestamp = timestamp
|
|
488
515
|
self.size = size
|
|
516
|
+
self.doi = doi
|
|
489
517
|
|
|
490
518
|
def __call__(self, t):
    """Apply the annotation to ``t`` and register a ``DatasetWrapper`` for it.

    If no base type was given to the annotation, it is taken from ``t``
    itself when ``t`` is a ``Base`` subclass, otherwise from the return
    annotation of ``t``.

    :param t: the decorated class or function
    :return: ``t`` itself when it is a ``Base`` subclass (the wrapper is then
        reachable through ``t.__dataset__``), otherwise the ``DatasetWrapper``
    :raises KeyError: if the base type must be inferred and ``t`` has no
        return annotation
    :raises AssertionError: if ``t`` already carries ``__datamaestro__``
    """
    # Imported unconditionally: the name is needed both when inferring the
    # base type and when choosing the return value. Importing it only inside
    # the ``self.base is None`` branch left it unbound when a base was given
    # explicitly and ``t`` was a class.
    from datamaestro.data import Base

    is_data_class = inspect.isclass(t) and issubclass(t, Base)

    try:
        if self.base is None:
            if is_data_class:
                self.base = t
            else:
                # Get type from return annotation
                try:
                    self.base = t.__annotations__["return"]
                except KeyError:
                    logging.warning("No return annotation in %s", t)
                    raise
        # Raises AttributeError (swallowed below) unless t was already decorated
        object.__getattribute__(t, "__datamaestro__")
        raise AssertionError("@data should only be called once")
    except AttributeError:
        pass

    dw = DatasetWrapper(self, t)
    t.__dataset__ = dw
    if is_data_class:
        return t
    return dw
|
|
502
542
|
|
|
503
543
|
|
datamaestro/download/__init__.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
from typing import Union
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
2
3
|
from datamaestro.definitions import AbstractDataset, DatasetAnnotation
|
|
3
4
|
from datamaestro.utils import deprecated
|
|
5
|
+
from attrs import define
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
def initialized(method):
|
|
@@ -15,7 +17,12 @@ def initialized(method):
|
|
|
15
17
|
return wrapper
|
|
16
18
|
|
|
17
19
|
|
|
18
|
-
|
|
20
|
+
@define(kw_only=True)
class SetupOptions:
    """Options accepted by ``Resource.setup`` (no field defined yet)"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Resource(DatasetAnnotation, ABC):
|
|
19
26
|
"""
|
|
20
27
|
Base class for all download handlers
|
|
21
28
|
"""
|
|
@@ -24,13 +31,16 @@ class Download(DatasetAnnotation, ABC):
|
|
|
24
31
|
self.varname = varname
|
|
25
32
|
# Ensures that the object is initialized
|
|
26
33
|
self._post = False
|
|
34
|
+
self.definition = None
|
|
27
35
|
|
|
28
36
|
def annotate(self, dataset: "AbstractDataset"):
    """Register this resource on ``dataset``.

    The resource is stored under ``self.varname`` in ``dataset.resources``,
    appended to ``dataset.ordered_resources``, and ``self.definition`` is set
    to ``dataset``.

    :param dataset: the dataset definition receiving the resource
    :raises AssertionError: if this resource is already attached to a
        dataset, or if ``self.varname`` is already registered on ``dataset``
    """
    # A resource instance can only be bound to a single dataset
    if self.definition is not None:
        raise AssertionError(
            "Resource %s is already attached to a dataset" % self.varname
        )

    # Register as a resource download
    if self.varname in dataset.resources:
        # The name must be interpolated here: AssertionError does not apply
        # %-formatting to its arguments
        raise AssertionError(
            "Name %s already declared as a resource" % self.varname
        )

    dataset.resources[self.varname] = self
    dataset.ordered_resources.append(self)
    self.definition = dataset
|
|
35
45
|
|
|
36
46
|
@property
|
|
@@ -53,10 +63,29 @@ class Download(DatasetAnnotation, ABC):
|
|
|
53
63
|
"""Prepares the dataset"""
|
|
54
64
|
...
|
|
55
65
|
|
|
66
|
+
def setup(
    self,
    dataset: Union[AbstractDataset],
    options: SetupOptions = None,
):
    """Attach this resource to ``dataset`` and prepare it, without annotation.

    Calls this object on ``dataset`` (``self(dataset)``) and then returns the
    result of ``self.prepare()``.

    :param dataset: the dataset definition the resource is applied to
    :param options: accepted but not used by this implementation
    :return: whatever ``prepare()`` returns
    """
    self(dataset)
    prepared = self.prepare()
    return prepared
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Keeps downwards compatibility
|
|
77
|
+
Download = Resource
|
|
78
|
+
|
|
56
79
|
|
|
57
80
|
class reference(Download):
|
|
58
|
-
def __init__(self, varname, reference):
|
|
81
|
+
def __init__(self, varname=None, reference=None):
    """References another dataset

    :param varname: The name of the variable
    :param reference: Another dataset (required, despite the ``None``
        default in the signature)
    :raises AssertionError: if ``reference`` is ``None``
    """
    super().__init__(varname)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # the exception type is kept so callers observe the same error.
    if reference is None:
        raise AssertionError("Reference cannot be null")
    self.reference = reference
|
|
61
90
|
|
|
62
91
|
def prepare(self):
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import json
|
|
3
|
+
from datamaestro.download import Resource
|
|
4
|
+
from typing import Callable, Iterator
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import requests
|
|
7
|
+
import random
|
|
8
|
+
import re
|
|
9
|
+
from requests.exceptions import HTTPError
|
|
10
|
+
from tqdm.auto import tqdm
|
|
11
|
+
import time
|
|
12
|
+
import urllib.parse
|
|
13
|
+
import uuid
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
wayback_prefix = re.compile(r"^https:\/\/web\.archive\.org\/web")
|
|
17
|
+
replace_pattern = re.compile(r"(web\.archive\.org\/web\/\d+)")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def download_with_retry(url: str, max_retries: int = 10) -> requests.Response:
    """Download a URL with exponential backoff, until max_retries is reached.

    Only HTTP 429 and 5xx responses are retried; any other HTTP error is
    re-raised immediately.

    :param url: the URL to fetch with a GET request
    :param max_retries: maximum number of retries after the first attempt
    :return: the successful response
    :raises HTTPError: on a non-retryable status, or once ``max_retries``
        retries have failed
    """
    retry_num = 0
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if status_code != 429 and status_code < 500:
                # This is not an error we should retry on
                raise

            # ``>=`` so that at most ``max_retries`` retries are performed
            if retry_num >= max_retries:
                logging.error(
                    "Failed to perform GET request on %s after %d retries.",
                    url,
                    max_retries,
                )
                raise

            # Exponential backoff with up to one second of jitter; rate
            # limiting (429) waits five extra seconds
            delay = 2**retry_num + random.randint(0, 1000) / 1000
            if status_code == 429:
                delay += 5
            time.sleep(delay)
            retry_num += 1
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def download_link(link: str, timestamp: str):
    """Fetch the Wayback Machine capture of ``link``.

    If ``link`` is not already a ``https://web.archive.org/web`` URL, the
    availability API is queried (up to three times) for the snapshot closest
    to ``timestamp``. The snapshot URL is then rewritten with the ``id_``
    suffix after its timestamp and downloaded.

    Errors are never propagated: they are logged and reported in the
    returned record.

    :param link: the original URL, or a Wayback Machine snapshot URL
    :param timestamp: the timestamp passed to the availability API
    :return: on success ``{"link", "id", "contents"}``; otherwise a record
        with ``"link"``, ``"page_id"``, ``"available": False`` and, for
        exceptions, ``"status_code"`` and ``"wayback_url"``
    """
    page_id = str(uuid.uuid4())
    url_no_header = None
    # Bound up-front so that error handlers can always inspect it
    response = None

    try:
        # Find the Wayback Machine link
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)

            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived
            # snapshots shows page is unavailable when it actually is. Give it a
            # total of three tries.
            while not available and availability_attempt < 3:
                response = download_with_retry(
                    "http://archive.org/wayback/available?"
                    f"url={link_encoded}&timestamp={timestamp}"
                )
                json_response = response.json()
                available = "closest" in json_response["archived_snapshots"]
                availability_attempt += 1

            if not available:
                logging.warning(
                    "Not available on Wayback Machine: %s, HTTP code %s, %s",
                    link,
                    response.status_code,
                    json_response,
                )
                return {"link": link, "page_id": page_id, "available": False}

            url = json_response["archived_snapshots"]["closest"]["url"]
        else:
            url = link

        # Append ``id_`` to the snapshot timestamp. Only the first occurrence
        # is rewritten, so an archived URL that itself contains a Wayback
        # address is left intact.
        rewritten, replaced = replace_pattern.subn(
            lambda m: f"{m.group(1)}id_", url, count=1
        )
        if not replaced:
            raise ValueError(f"Not a Wayback Machine snapshot URL: {url}")
        url_no_header = rewritten

        response = download_with_retry(url_no_header)
        html_page = response.text

        return {
            "link": link,
            "id": url_no_header,
            "contents": html_page,
        }

    except HTTPError as http_err:
        logging.warning("HTTP error occurred: %s for %s", http_err, link)
        # Compare with None: a requests Response is falsy for 4xx/5xx
        # statuses, so a truthiness test would always discard the code
        failed = http_err.response
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": failed.status_code if failed is not None else None,
            "wayback_url": url_no_header,
        }
    except UnicodeDecodeError as e:
        logging.warning("Unicode decode error occurred: %s for %s", e, link)
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": response.status_code if response is not None else None,
            "wayback_url": url_no_header,
        }
    except Exception as e:
        logging.warning("Exception occurred: %s for %s", e, link)
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": None,
            "wayback_url": url_no_header,
        }
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class wayback_documents(Resource):
    """Collect documents from wayback

    The documents are stored as JSON lines (one ``download_link`` record per
    line) in ``<datapath>/<varname>``; a sibling ``.done`` file marks a
    completed download.
    """

    def __init__(self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None):
        """
        :param timestamp: the timestamp passed to ``download_link`` for
            every URL
        :param urls_fn: a function returning the URLs to collect
        :param name: the resource name, forwarded to ``Resource``
        """
        super().__init__(name)
        self.timestamp = timestamp
        self.urls_fn = urls_fn

    def prepare(self):
        """Returns the path of the JSON lines file"""
        return self.definition.datapath / self.varname

    @staticmethod
    def _read_processed(fp) -> set:
        """Returns the links already stored in ``fp``.

        Reading stops at the first line that is incomplete or cannot be
        decoded; ``fp`` is left positioned right after the last valid record.
        """
        seen = set()
        valid_end = 0
        while True:
            # readline() rather than iterating over fp: tell() is disabled
            # while a text file is being iterated
            line = fp.readline()
            if not line.endswith("\n"):
                # End of file, or a record cut short by an interruption
                break
            try:
                seen.add(json.loads(line)["link"])
            except (json.JSONDecodeError, KeyError, TypeError):
                logging.warning(
                    "JSON decoding error: getting back to position %s", valid_end
                )
                break
            valid_end = fp.tell()

        fp.seek(valid_end)
        return seen

    def download(self, force=False):
        """Download the documents, resuming a previous partial download.

        Links already present in the destination file are skipped, whether
        or not their retrieval had succeeded.

        :param force: restart from scratch, even when a previous download
            completed
        :return: ``True`` when a previous download was already complete
        """
        # Creates directory if needed
        destination: Path = self.definition.datapath / self.varname
        self.definition.datapath.mkdir(parents=True, exist_ok=True)

        # Early exit
        done_path = destination.with_suffix(".done")
        if done_path.is_file():
            if not force:
                return True
            # A stale marker must not outlive an interrupted forced run
            done_path.unlink()

        logging.info("Retrieving URLs from wayback")
        with destination.open("a+", encoding="utf-8") as fp:
            fp.seek(0)
            seen = set() if force else self._read_processed(fp)
            # Drop everything after the last valid record (the whole file
            # when forcing): in append mode, writes always go to the end
            fp.truncate()

            # Get the remaining ones
            for url in tqdm(self.urls_fn()):
                if url in seen:
                    continue
                # One record per line, so that the file can be read back
                fp.write(json.dumps(download_link(url, self.timestamp)) + "\n")
                seen.add(url)

        # Everything is fine
        done_path.touch()
|
datamaestro/record.py
CHANGED
|
@@ -129,7 +129,20 @@ class Record:
|
|
|
129
129
|
def __str__(self):
    """Render the record as ``{module.QualName: value, ...}``.

    Each entry of ``self.items`` is shown with the fully qualified name of
    its key and the ``str`` form of its value.
    """
    rendered = [
        f"{key.__module__}.{key.__qualname__}: {value}"
        for key, value in self.items.items()
    ]
    body = ", ".join(rendered)
    return "{" + body + "}"
|
|
138
|
+
|
|
139
|
+
def __repr__(self):
    """Render the record as ``{module.QualName: repr(value), ...}``.

    Same layout as ``__str__``, but each value is shown through ``repr``.
    """
    rendered = [
        f"{key.__module__}.{key.__qualname__}: {value!r}"
        for key, value in self.items.items()
    ]
    body = ", ".join(rendered)
    return "{" + body + "}"
|
|
135
148
|
|
datamaestro/version.py
CHANGED
|
@@ -1,25 +1,25 @@
|
|
|
1
1
|
datamaestro/__init__.py,sha256=9M5hA6FVngduJBcjInvJWQM8n0cqapXAFPzfRLHR74c,237
|
|
2
2
|
datamaestro/__main__.py,sha256=tJTf1sTWKRIatvBcHlWDIZRZodAZ2B2zkD01pD89MYk,9024
|
|
3
|
-
datamaestro/context.py,sha256=
|
|
4
|
-
datamaestro/definitions.py,sha256=
|
|
5
|
-
datamaestro/record.py,sha256=
|
|
3
|
+
datamaestro/context.py,sha256=8U5EYEdc9xcHnZFFk4PCZttxxGsmlzRVR8rLBy2zVBw,13605
|
|
4
|
+
datamaestro/definitions.py,sha256=mBoLgrbO1eHVcqMPkb4lxadNdgSsy_w355nZofvBoF8,16732
|
|
5
|
+
datamaestro/record.py,sha256=m3WGsPcZ1LouQXNJOBUK3QusAIRiuy6T_oqhq09-Ckg,5504
|
|
6
6
|
datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
|
|
7
7
|
datamaestro/search.py,sha256=PMceNp5hcp0dlzs4cLb6LJT7XHrdXo58oO7oTucawbE,2887
|
|
8
8
|
datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
|
|
9
9
|
datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
|
|
10
10
|
datamaestro/utils.py,sha256=Y3_aqeOHW8vuifwggGWJfgONyDG1FLX7ONAnX85jENI,6511
|
|
11
|
-
datamaestro/version.py,sha256=
|
|
11
|
+
datamaestro/version.py,sha256=2U0Gn26fYI3Vgj5hgkLM8I3wI6YEVdffJGllaVW-sSc,411
|
|
12
12
|
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
13
|
-
datamaestro/annotations/agreement.py,sha256=
|
|
13
|
+
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
14
14
|
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
|
|
16
16
|
datamaestro/commands/site.py,sha256=nnz4tOwKcgUmsLfPcQVo2SgFIC3OShYfJ8S2N6vuzAw,14173
|
|
17
17
|
datamaestro/data/__init__.py,sha256=vOedQsnYtxI2yj-M2nm32eHpIu9S_WRzfA3futlHNs4,1412
|
|
18
18
|
datamaestro/data/csv.py,sha256=-UXjEbKPvhhZ9_MdYnxUsD8Zsz2t4ZFbserFuHak8pw,2515
|
|
19
19
|
datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
|
|
20
|
-
datamaestro/data/ml.py,sha256=
|
|
20
|
+
datamaestro/data/ml.py,sha256=guh1bxi7Dl3SajJdtBFrtPh6K8eNKiMkBKmBeKGuW5U,710
|
|
21
21
|
datamaestro/data/tensor.py,sha256=OVzV1krIRslui8REdl7hPFu3AXlUyDxf5yUZlbNYsz8,2001
|
|
22
|
-
datamaestro/download/__init__.py,sha256=
|
|
22
|
+
datamaestro/download/__init__.py,sha256=Iqz7zEzeTsBWzE_6bpurhZVtzRjyXVUwCY6MEVjJpO0,2592
|
|
23
23
|
datamaestro/download/archive.py,sha256=G-2gzepknqT7Us3naMGAApGVGJMeHQIxM-tSpaa9ark,5608
|
|
24
24
|
datamaestro/download/huggingface.py,sha256=LkzmZo2Z0yccqAfj7di7jDNGFrMKN9m8IM8SfexOomY,1125
|
|
25
25
|
datamaestro/download/links.py,sha256=GFnq_AzI_uen7JBuGWD9qveeC9QFBWDrSnj7pOcwWwM,3352
|
|
@@ -28,6 +28,7 @@ datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0
|
|
|
28
28
|
datamaestro/download/single.py,sha256=QSEviTP9lHLh3ZGyo_KoW3ro8UvWCGNPHeZiNj-9rLA,4134
|
|
29
29
|
datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
|
|
30
30
|
datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
|
|
31
|
+
datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
|
|
31
32
|
datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
|
|
32
33
|
datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
|
|
33
34
|
datamaestro/stream/lines.py,sha256=UNGcyZlZxN0Q7kw717jbhZFdDVmtfJfkJZCgK7xzF9A,1996
|
|
@@ -38,9 +39,9 @@ datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,
|
|
|
38
39
|
datamaestro/test/test_annotations.py,sha256=kRPUmS_UAN6JSSVPUwV4OM_LEuEUHF1OcLSiYXjsKjw,246
|
|
39
40
|
datamaestro/test/test_download_handlers.py,sha256=Qqm-fML1KVp6dPwAUcH6xzi_dpQIshvROzviSYCUzc0,603
|
|
40
41
|
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
41
|
-
datamaestro-1.1.
|
|
42
|
-
datamaestro-1.1.
|
|
43
|
-
datamaestro-1.1.
|
|
44
|
-
datamaestro-1.1.
|
|
45
|
-
datamaestro-1.1.
|
|
46
|
-
datamaestro-1.1.
|
|
42
|
+
datamaestro-1.2.1.dist-info/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
43
|
+
datamaestro-1.2.1.dist-info/METADATA,sha256=2_TL_ysMtfV2a84_0Uu3UQloCHCvetGZWo5tcjdhNCA,8999
|
|
44
|
+
datamaestro-1.2.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
45
|
+
datamaestro-1.2.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
46
|
+
datamaestro-1.2.1.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
|
|
47
|
+
datamaestro-1.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|