hdx-python-scraper 2.3.8__py3-none-any.whl → 2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdx/scraper/_version.py +2 -2
- hdx/scraper/configurable/rowparser.py +57 -22
- hdx/scraper/utilities/reader.py +9 -6
- {hdx_python_scraper-2.3.8.dist-info → hdx_python_scraper-2.4.0.dist-info}/METADATA +3 -3
- {hdx_python_scraper-2.3.8.dist-info → hdx_python_scraper-2.4.0.dist-info}/RECORD +7 -7
- {hdx_python_scraper-2.3.8.dist-info → hdx_python_scraper-2.4.0.dist-info}/WHEEL +1 -1
- {hdx_python_scraper-2.3.8.dist-info → hdx_python_scraper-2.4.0.dist-info}/licenses/LICENSE +0 -0
hdx/scraper/_version.py
CHANGED
|
@@ -185,20 +185,14 @@ class RowParser:
|
|
|
185
185
|
Returns:
|
|
186
186
|
Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
|
|
187
187
|
"""
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
if self.
|
|
196
|
-
if all(
|
|
197
|
-
row[key] == value for key, value in self.stop_row.items()
|
|
198
|
-
):
|
|
199
|
-
break
|
|
200
|
-
for newrow in self.flatten(row):
|
|
201
|
-
rows.append(newrow)
|
|
188
|
+
if self.header_to_hxltag:
|
|
189
|
+
iterator = self.header_to_hxltag_rows(iterator)
|
|
190
|
+
if self.stop_row:
|
|
191
|
+
iterator = self.stop_rows(iterator)
|
|
192
|
+
if self.flatteninfo:
|
|
193
|
+
iterator = self.flatten_rows(iterator)
|
|
194
|
+
if self.prefilter:
|
|
195
|
+
iterator = (row for row in iterator if eval(self.prefilter))
|
|
202
196
|
if not self.sort:
|
|
203
197
|
if self.datecol:
|
|
204
198
|
for subset in self.subsets:
|
|
@@ -212,15 +206,59 @@ class RowParser:
|
|
|
212
206
|
)
|
|
213
207
|
self.sort = {"keys": [self.datecol], "reverse": True}
|
|
214
208
|
break
|
|
215
|
-
if self.prefilter:
|
|
216
|
-
rows = [row for row in rows if eval(self.prefilter)]
|
|
217
209
|
if self.sort:
|
|
218
210
|
keys = self.sort["keys"]
|
|
219
211
|
reverse = self.sort.get("reverse", False)
|
|
220
|
-
|
|
221
|
-
return
|
|
212
|
+
iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
|
|
213
|
+
return iterator
|
|
214
|
+
|
|
215
|
+
def header_to_hxltag_rows(
|
|
216
|
+
self, iterator: Iterator[Dict]
|
|
217
|
+
) -> Generator[Dict, None, None]:
|
|
218
|
+
"""Convert headers to HXL tags in keys
|
|
219
|
+
|
|
220
|
+
Args:
|
|
221
|
+
iterator (Iterator[Dict]): Input data
|
|
222
|
+
|
|
223
|
+
Returns:
|
|
224
|
+
Generator[Dict]: Rows where keys are HXL tags
|
|
225
|
+
"""
|
|
226
|
+
for row in iterator:
|
|
227
|
+
newrow = {}
|
|
228
|
+
for header in row:
|
|
229
|
+
newrow[self.header_to_hxltag[header]] = row[header]
|
|
230
|
+
yield newrow
|
|
231
|
+
|
|
232
|
+
def stop_rows(
|
|
233
|
+
self, iterator: Iterator[Dict]
|
|
234
|
+
) -> Generator[Dict, None, None]:
|
|
235
|
+
"""Stop processing rows after condition met
|
|
236
|
+
|
|
237
|
+
Args:
|
|
238
|
+
iterator (Iterator[Dict]): Input data
|
|
239
|
+
|
|
240
|
+
Returns:
|
|
241
|
+
Generator[Dict]: Rows up to stop condition
|
|
242
|
+
"""
|
|
243
|
+
for row in iterator:
|
|
244
|
+
if all(row[key] == value for key, value in self.stop_row.items()):
|
|
245
|
+
break
|
|
246
|
+
yield row
|
|
247
|
+
|
|
248
|
+
def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
|
|
249
|
+
"""Flatten rows
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
iterator (Iterator[Dict]): Input data
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
Generator[Dict]: Flattened rows
|
|
256
|
+
"""
|
|
257
|
+
for row in iterator:
|
|
258
|
+
for newrow in self.flatten_row(row):
|
|
259
|
+
yield newrow
|
|
222
260
|
|
|
223
|
-
def
|
|
261
|
+
def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
|
|
224
262
|
"""Flatten a wide spreadsheet format into a long one
|
|
225
263
|
|
|
226
264
|
Args:
|
|
@@ -229,9 +267,6 @@ class RowParser:
|
|
|
229
267
|
Returns:
|
|
230
268
|
Generator[Dict]: Flattened row(s)
|
|
231
269
|
"""
|
|
232
|
-
if not self.flatteninfo:
|
|
233
|
-
yield row
|
|
234
|
-
return
|
|
235
270
|
counters = [-1 for _ in self.flatteninfo]
|
|
236
271
|
while True:
|
|
237
272
|
newrow = copy.deepcopy(row)
|
hdx/scraper/utilities/reader.py
CHANGED
|
@@ -206,15 +206,18 @@ class Read(Retrieve):
|
|
|
206
206
|
if headers is None:
|
|
207
207
|
headers = 1
|
|
208
208
|
datasetinfo["headers"] = 1
|
|
209
|
-
kwargs["headers"] = headers
|
|
210
|
-
if isinstance(headers, list):
|
|
211
|
-
kwargs["fill_merged_cells"] = True
|
|
212
209
|
format = datasetinfo["format"]
|
|
213
210
|
kwargs["format"] = format
|
|
214
|
-
if
|
|
215
|
-
|
|
211
|
+
if format in ("xls", "xlsx"):
|
|
212
|
+
if not sheet:
|
|
213
|
+
sheet = 1
|
|
214
|
+
if isinstance(headers, list):
|
|
215
|
+
kwargs["fill_merged_cells"] = True
|
|
216
|
+
elif "fill_merged_cells" not in kwargs:
|
|
217
|
+
kwargs["fill_merged_cells"] = False
|
|
216
218
|
if sheet:
|
|
217
219
|
kwargs["sheet"] = sheet
|
|
220
|
+
kwargs["headers"] = headers
|
|
218
221
|
compression = datasetinfo.get("compression")
|
|
219
222
|
if compression:
|
|
220
223
|
kwargs["compression"] = compression
|
|
@@ -302,7 +305,7 @@ class Read(Retrieve):
|
|
|
302
305
|
f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
|
|
303
306
|
)
|
|
304
307
|
datasets = []
|
|
305
|
-
for file_path in glob.glob(f"{saved_path}_*.json"):
|
|
308
|
+
for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
|
|
306
309
|
datasets.append(Dataset.load_from_json(file_path))
|
|
307
310
|
else:
|
|
308
311
|
datasets = Dataset.search_in_hdx(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: hdx-python-scraper
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: HDX Python scraper utilities to assemble data from multiple sources
|
|
5
5
|
Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
|
|
6
6
|
Author-email: Michael Rans <rans@email.com>
|
|
@@ -27,8 +27,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Requires-Dist: gspread
|
|
29
29
|
Requires-Dist: hdx-python-api>=6.3.1
|
|
30
|
-
Requires-Dist: hdx-python-country>=3.7.
|
|
31
|
-
Requires-Dist: hdx-python-utilities>=3.7.
|
|
30
|
+
Requires-Dist: hdx-python-country>=3.7.6
|
|
31
|
+
Requires-Dist: hdx-python-utilities>=3.7.2
|
|
32
32
|
Requires-Dist: regex
|
|
33
33
|
Provides-Extra: dev
|
|
34
34
|
Requires-Dist: pre-commit; extra == 'dev'
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
|
|
2
|
-
hdx/scraper/_version.py,sha256=
|
|
2
|
+
hdx/scraper/_version.py,sha256=NXpAHvzuYHxlLDJV0489874frLu4dA2joFw1iHLLrOg,411
|
|
3
3
|
hdx/scraper/base_scraper.py,sha256=2eJifpb8G_KtEb9Z273suDCiMPteJsCBHwDEk3o0wA8,15433
|
|
4
4
|
hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
|
|
5
5
|
hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
|
|
7
7
|
hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
|
|
8
|
-
hdx/scraper/configurable/rowparser.py,sha256=
|
|
8
|
+
hdx/scraper/configurable/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
|
|
9
9
|
hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
|
|
10
10
|
hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
|
|
11
11
|
hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
|
|
|
15
15
|
hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
|
|
16
16
|
hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
|
|
17
17
|
hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
|
|
18
|
-
hdx/scraper/utilities/reader.py,sha256=
|
|
18
|
+
hdx/scraper/utilities/reader.py,sha256=A8GeMAie9swvydouBeD3hPi2YuH8liBsLJsEiqUqqfw,26500
|
|
19
19
|
hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
|
|
20
20
|
hdx/scraper/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
|
|
21
21
|
hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
|
|
22
|
-
hdx_python_scraper-2.
|
|
23
|
-
hdx_python_scraper-2.
|
|
24
|
-
hdx_python_scraper-2.
|
|
25
|
-
hdx_python_scraper-2.
|
|
22
|
+
hdx_python_scraper-2.4.0.dist-info/METADATA,sha256=TZmLTpIowY1ta86DPlupiocA_uLH4Vda7uANNh9j38E,3361
|
|
23
|
+
hdx_python_scraper-2.4.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
24
|
+
hdx_python_scraper-2.4.0.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
|
|
25
|
+
hdx_python_scraper-2.4.0.dist-info/RECORD,,
|
|
File without changes
|