datamaestro 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {datamaestro-1.2.0 → datamaestro-1.2.1}/PKG-INFO +1 -1
  2. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/api/download.rst +4 -0
  3. datamaestro-1.2.1/src/datamaestro/download/wayback.py +163 -0
  4. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/version.py +2 -2
  5. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/PKG-INFO +1 -1
  6. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/SOURCES.txt +1 -0
  7. {datamaestro-1.2.0 → datamaestro-1.2.1}/.coverage +0 -0
  8. {datamaestro-1.2.0 → datamaestro-1.2.1}/.github/workflows/pytest.yml +0 -0
  9. {datamaestro-1.2.0 → datamaestro-1.2.1}/.github/workflows/python-publish.yml +0 -0
  10. {datamaestro-1.2.0 → datamaestro-1.2.1}/.gitignore +0 -0
  11. {datamaestro-1.2.0 → datamaestro-1.2.1}/.pre-commit-config.yaml +0 -0
  12. {datamaestro-1.2.0 → datamaestro-1.2.1}/.readthedocs.yml +0 -0
  13. {datamaestro-1.2.0 → datamaestro-1.2.1}/CHANGELOG.md +0 -0
  14. {datamaestro-1.2.0 → datamaestro-1.2.1}/LICENSE +0 -0
  15. {datamaestro-1.2.0 → datamaestro-1.2.1}/MANIFEST.in +0 -0
  16. {datamaestro-1.2.0 → datamaestro-1.2.1}/README.md +0 -0
  17. {datamaestro-1.2.0 → datamaestro-1.2.1}/TODO.md +0 -0
  18. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/Makefile +0 -0
  19. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/make.bat +0 -0
  20. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/requirements.txt +0 -0
  21. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/api/data.md +0 -0
  22. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/api/index.md +0 -0
  23. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/api/records.rst +0 -0
  24. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/conf.py +0 -0
  25. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/datasets.rst +0 -0
  26. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/developping.md +0 -0
  27. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/index.md +0 -0
  28. {datamaestro-1.2.0 → datamaestro-1.2.1}/docs/source/style.css +0 -0
  29. {datamaestro-1.2.0 → datamaestro-1.2.1}/mkdocs.yml +0 -0
  30. {datamaestro-1.2.0 → datamaestro-1.2.1}/pyproject.toml +0 -0
  31. {datamaestro-1.2.0 → datamaestro-1.2.1}/pytest.ini +0 -0
  32. {datamaestro-1.2.0 → datamaestro-1.2.1}/requirements-dev.txt +0 -0
  33. {datamaestro-1.2.0 → datamaestro-1.2.1}/requirements.txt +0 -0
  34. {datamaestro-1.2.0 → datamaestro-1.2.1}/schema.yaml +0 -0
  35. {datamaestro-1.2.0 → datamaestro-1.2.1}/setup.cfg +0 -0
  36. {datamaestro-1.2.0 → datamaestro-1.2.1}/setup.py +0 -0
  37. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/__init__.py +0 -0
  38. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/__main__.py +0 -0
  39. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/annotations/__init__.py +0 -0
  40. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/annotations/agreement.py +0 -0
  41. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/commands/__init__.py +0 -0
  42. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/commands/mainstyle.css +0 -0
  43. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/commands/site.py +0 -0
  44. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/context.py +0 -0
  45. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/data/__init__.py +0 -0
  46. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/data/csv.py +0 -0
  47. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/data/huggingface.py +0 -0
  48. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/data/ml.py +0 -0
  49. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/data/tensor.py +0 -0
  50. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/definitions.py +0 -0
  51. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/__init__.py +0 -0
  52. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/archive.py +0 -0
  53. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/huggingface.py +0 -0
  54. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/links.py +0 -0
  55. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/manual.py +0 -0
  56. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/multiple.py +0 -0
  57. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/single.py +0 -0
  58. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/sync.py +0 -0
  59. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/download/todo.py +0 -0
  60. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/record.py +0 -0
  61. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/registry.py +0 -0
  62. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/search.py +0 -0
  63. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/settings.py +0 -0
  64. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/sphinx.py +0 -0
  65. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/stream/__init__.py +0 -0
  66. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/stream/compress.py +0 -0
  67. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/stream/lines.py +0 -0
  68. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/templates/dataset.py +0 -0
  69. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/test/__init__.py +0 -0
  70. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/test/checks.py +0 -0
  71. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/test/conftest.py +0 -0
  72. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/test/test_annotations.py +0 -0
  73. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/test/test_download_handlers.py +0 -0
  74. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/test/test_record.py +0 -0
  75. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro/utils.py +0 -0
  76. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/dependency_links.txt +0 -0
  77. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/entry_points.txt +0 -0
  78. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/not-zip-safe +0 -0
  79. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/requires.txt +0 -0
  80. {datamaestro-1.2.0 → datamaestro-1.2.1}/src/datamaestro.egg-info/top_level.txt +0 -0
  81. {datamaestro-1.2.0 → datamaestro-1.2.1}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -40,6 +40,10 @@ Package `datamaestro.download.links`
40
40
  .. autofunction:: datamaestro.download.links.linkfile
41
41
 
42
42
 
43
+ Other
44
+ =====
45
+
46
+ .. autofunction:: datamaestro.download.wayback.wayback_documents
43
47
 
44
48
 
45
49
 
@@ -0,0 +1,163 @@
1
+ import logging
2
+ import json
3
+ from datamaestro.download import Resource
4
+ from typing import Callable, Iterator
5
+ from pathlib import Path
6
+ import requests
7
+ import random
8
+ import re
9
+ from requests.exceptions import HTTPError
10
+ from tqdm.auto import tqdm
11
+ import time
12
+ import urllib.parse
13
+ import uuid
14
+
15
+
16
# Matches links that already point at a Wayback Machine snapshot. Both the
# http and https schemes are accepted so that plain-http snapshot links are
# downloaded directly instead of being looked up through the availability API.
wayback_prefix = re.compile(r"^https?://web\.archive\.org/web")

# Captures the "web.archive.org/web/<digits>" part of a snapshot URL, i.e. the
# host, the "/web/" path component and the numeric timestamp that follows it.
replace_pattern = re.compile(r"(web\.archive\.org/web/\d+)")
18
+
19
+
20
def download_with_retry(
    url: str, max_retries: int = 10, timeout: float = 60.0
) -> requests.Response:
    """Download a URL with exponential backoff, until max_retries is reached.

    Only HTTP 429 and 5xx responses are retried; any other HTTP error is
    re-raised immediately. Exceptions that are not ``HTTPError`` (connection
    errors, timeouts, ...) are not handled here and propagate to the caller.

    :param url: The URL to fetch with a GET request
    :param max_retries: Maximum number of retries after the first attempt
    :param timeout: Timeout (in seconds) passed to ``requests.get`` so that a
        stalled connection cannot block forever
    :return: The successful response
    :raises HTTPError: for a non-retryable status code, or when the retry
        budget is exhausted
    """
    retry_num = 0
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if status_code != 429 and status_code < 500:
                # This is not an error we should retry on
                raise

            if retry_num >= max_retries:
                logging.error(
                    f"Failed to perform GET request on {url} "
                    f"after {max_retries} retries."
                )
                raise

            # Exponential backoff with up to one second of random jitter;
            # "too many requests" (429) waits an extra five seconds
            delay = 2**retry_num + random.randint(0, 1000) / 1000
            if status_code == 429:
                delay += 5
            time.sleep(delay)
            retry_num += 1
46
+
47
+
48
def download_link(link: str, timestamp: str) -> dict:
    """Fetch the archived content of a link from the Wayback Machine.

    If ``link`` is not already a Wayback Machine URL, the availability API is
    queried (up to three times) for the snapshot closest to ``timestamp``.
    The snapshot is then downloaded through its ``id_`` variant.

    This function never raises: every failure is logged and reported in the
    returned record.

    :param link: The original URL, or a Wayback Machine snapshot URL
    :param timestamp: Timestamp forwarded to the availability API
    :return: On success, a dictionary with keys ``link``, ``id`` (the URL
        that was downloaded), ``contents`` (the page text), ``page_id`` and
        ``available`` (True). On failure, a dictionary with ``link``,
        ``page_id`` and ``available`` (False), plus ``status_code`` and
        ``wayback_url`` when an exception was caught.
    """
    page_id = str(uuid.uuid4())
    url_no_header = None
    # Defined up-front so that exception handlers can inspect it safely even
    # when the failure happens before any request has completed
    response = None

    def _unavailable(status_code):
        # Record describing a download that failed with an exception
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": status_code,
            "wayback_url": url_no_header,
        }

    try:
        # Find the Wayback Machine link
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)

            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived
            # snapshots shows page is unavailable when it actually is. Give it a
            # total of three tries.
            while not available and availability_attempt < 3:
                response = download_with_retry(
                    "http://archive.org/wayback/available?"
                    f"url={link_encoded}&timestamp={timestamp}"
                )
                json_response = response.json()
                available = "closest" in json_response["archived_snapshots"]
                availability_attempt += 1

            if not available:
                logging.warning(
                    f"Not available on Wayback Machine: {link}, "
                    f"HTTP code {response.status_code}, {json_response}"
                )
                return {"link": link, "page_id": page_id, "available": False}

            url = json_response["archived_snapshots"]["closest"]["url"]
        else:
            url = link

        match = replace_pattern.search(url)
        if match is None:
            # Explicit check rather than `assert`, which is stripped under -O
            raise ValueError(f"Not a Wayback Machine snapshot URL: {url}")

        # Insert the `id_` modifier right after the snapshot timestamp
        # (presumably this requests the page without the Wayback banner, hence
        # the variable name -- confirm against the Wayback Machine docs).
        # Only the first occurrence is rewritten, and an already present
        # modifier is not duplicated.
        snapshot_end = match.end(1)
        if url.startswith("id_", snapshot_end):
            url_no_header = url
        else:
            url_no_header = f"{url[:snapshot_end]}id_{url[snapshot_end:]}"

        response = download_with_retry(url_no_header)
        html_page = response.text

        return {
            "link": link,
            "id": url_no_header,
            "contents": html_page,
            # Same bookkeeping keys as the failure records
            "page_id": page_id,
            "available": True,
        }

    except HTTPError as http_err:
        logging.warning(f"HTTP error occurred: {http_err} for {link}")
        # A `requests.Response` is falsy for 4xx/5xx statuses, so the check
        # must be against None rather than relying on truthiness
        error_response = http_err.response
        return _unavailable(
            error_response.status_code if error_response is not None else None
        )
    except UnicodeDecodeError as e:
        logging.warning(f"Unicode decode error occurred: {e} for {link}")
        return _unavailable(response.status_code if response is not None else None)
    except Exception as e:
        logging.warning(f"Exception occurred: {e} for {link}")
        return _unavailable(None)
121
+
122
+
123
class wayback_documents(Resource):
    """Collect documents from wayback

    The documents are stored as JSON lines (one record per URL, as returned
    by ``download_link``) in a single file. An interrupted download is
    resumed: URLs that already have a record in the file are skipped.

    :param timestamp: Timestamp forwarded to ``download_link`` for each URL
    :param urls_fn: Callable returning an iterator over the URLs to collect
    :param name: Passed to the ``Resource`` constructor
    """

    def __init__(self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None):
        super().__init__(name)
        self.timestamp = timestamp
        self.urls_fn = urls_fn

    def prepare(self):
        """Return the path of the file containing the collected documents"""
        return self.definition.datapath / self.varname

    def download(self, force=False):
        """Download the documents that have not been retrieved yet

        :param force: When true, ignore the completion marker and any
            previously downloaded record, and start over
        :return: True if the download was already complete
        """
        # Creates directory if needed
        destination: Path = self.definition.datapath / self.varname
        self.definition.datapath.mkdir(parents=True, exist_ok=True)

        # Early exit
        done_path = destination.with_suffix(".done")
        if done_path.is_file():
            if not force:
                return True
            # A forced run that gets interrupted must not look complete
            done_path.unlink()

        logging.info("Retrieving URLs from wayback")

        # The file is handled in binary mode so that byte offsets given by
        # tell() are reliable (text-mode files disable tell() while being
        # iterated). JSON output is ASCII-only by default, so encoding the
        # records as UTF-8 produces the same bytes as text mode would.
        if force:
            mode = "wb+"  # truncates previous content
        else:
            destination.touch()
            mode = "rb+"

        pos = 0
        urls = set()
        with destination.open(mode) as fp:
            # Reads the URLs that have already been processed
            for line in fp:
                try:
                    if not line.endswith(b"\n"):
                        raise ValueError("incomplete record")
                    urls.add(json.loads(line)["link"])
                except (ValueError, KeyError, TypeError):
                    # Covers json.JSONDecodeError (a ValueError subclass) as
                    # well as truncated or malformed records
                    logging.warning(f"JSON decoding error: getting back to position {pos}")
                    break
                pos = fp.tell()

            # Drop anything after the last valid record before appending
            fp.seek(pos)
            fp.truncate()

            # Get the remaining ones
            for url in tqdm(self.urls_fn()):
                if url in urls:
                    continue
                record = json.dumps(download_link(url, self.timestamp))
                fp.write(record.encode("utf-8") + b"\n")
                # Limit the amount of work lost if the process is interrupted
                fp.flush()
                urls.add(url)

        # Everything is fine
        done_path.touch()
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '1.2.0'
16
- __version_tuple__ = version_tuple = (1, 2, 0)
15
+ __version__ = version = '1.2.1'
16
+ __version_tuple__ = version_tuple = (1, 2, 1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -67,6 +67,7 @@ src/datamaestro/download/multiple.py
67
67
  src/datamaestro/download/single.py
68
68
  src/datamaestro/download/sync.py
69
69
  src/datamaestro/download/todo.py
70
+ src/datamaestro/download/wayback.py
70
71
  src/datamaestro/stream/__init__.py
71
72
  src/datamaestro/stream/compress.py
72
73
  src/datamaestro/stream/lines.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes