hdx-python-scraper 2.3.8__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdx/scraper/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2.3.8'
16
- __version_tuple__ = version_tuple = (2, 3, 8)
15
+ __version__ = version = '2.4.0'
16
+ __version_tuple__ = version_tuple = (2, 4, 0)
@@ -185,20 +185,14 @@ class RowParser:
185
185
  Returns:
186
186
  Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
187
187
  """
188
- rows = []
189
- for row in iterator:
190
- if self.header_to_hxltag:
191
- newrow = {}
192
- for header in row:
193
- newrow[self.header_to_hxltag[header]] = row[header]
194
- row = newrow
195
- if self.stop_row:
196
- if all(
197
- row[key] == value for key, value in self.stop_row.items()
198
- ):
199
- break
200
- for newrow in self.flatten(row):
201
- rows.append(newrow)
188
+ if self.header_to_hxltag:
189
+ iterator = self.header_to_hxltag_rows(iterator)
190
+ if self.stop_row:
191
+ iterator = self.stop_rows(iterator)
192
+ if self.flatteninfo:
193
+ iterator = self.flatten_rows(iterator)
194
+ if self.prefilter:
195
+ iterator = (row for row in iterator if eval(self.prefilter))
202
196
  if not self.sort:
203
197
  if self.datecol:
204
198
  for subset in self.subsets:
@@ -212,15 +206,59 @@ class RowParser:
212
206
  )
213
207
  self.sort = {"keys": [self.datecol], "reverse": True}
214
208
  break
215
- if self.prefilter:
216
- rows = [row for row in rows if eval(self.prefilter)]
217
209
  if self.sort:
218
210
  keys = self.sort["keys"]
219
211
  reverse = self.sort.get("reverse", False)
220
- rows = sorted(rows, key=itemgetter(*keys), reverse=reverse)
221
- return rows
212
+ iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
213
+ return iterator
214
+
215
+ def header_to_hxltag_rows(
216
+ self, iterator: Iterator[Dict]
217
+ ) -> Generator[Dict, None, None]:
218
+ """Convert headers to HXL tags in keys
219
+
220
+ Args:
221
+ iterator (Iterator[Dict]): Input data
222
+
223
+ Returns:
224
+ Generator[Dict]: Rows where keys are HXL tags
225
+ """
226
+ for row in iterator:
227
+ newrow = {}
228
+ for header in row:
229
+ newrow[self.header_to_hxltag[header]] = row[header]
230
+ yield newrow
231
+
232
+ def stop_rows(
233
+ self, iterator: Iterator[Dict]
234
+ ) -> Generator[Dict, None, None]:
235
+ """Stop processing rows after condition met
236
+
237
+ Args:
238
+ iterator (Iterator[Dict]): Input data
239
+
240
+ Returns:
241
+ Generator[Dict]: Rows up to stop condition
242
+ """
243
+ for row in iterator:
244
+ if all(row[key] == value for key, value in self.stop_row.items()):
245
+ break
246
+ yield row
247
+
248
+ def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
249
+ """Flatten rows
250
+
251
+ Args:
252
+ iterator (Iterator[Dict]): Input data
253
+
254
+ Returns:
255
+ Generator[Dict]: Flattened rows
256
+ """
257
+ for row in iterator:
258
+ for newrow in self.flatten_row(row):
259
+ yield newrow
222
260
 
223
- def flatten(self, row: Dict) -> Generator[Dict, None, None]:
261
+ def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
224
262
  """Flatten a wide spreadsheet format into a long one
225
263
 
226
264
  Args:
@@ -229,9 +267,6 @@ class RowParser:
229
267
  Returns:
230
268
  Generator[Dict]: Flattened row(s)
231
269
  """
232
- if not self.flatteninfo:
233
- yield row
234
- return
235
270
  counters = [-1 for _ in self.flatteninfo]
236
271
  while True:
237
272
  newrow = copy.deepcopy(row)
@@ -206,15 +206,18 @@ class Read(Retrieve):
206
206
  if headers is None:
207
207
  headers = 1
208
208
  datasetinfo["headers"] = 1
209
- kwargs["headers"] = headers
210
- if isinstance(headers, list):
211
- kwargs["fill_merged_cells"] = True
212
209
  format = datasetinfo["format"]
213
210
  kwargs["format"] = format
214
- if not sheet and format in ("xls", "xlsx"):
215
- sheet = 1
211
+ if format in ("xls", "xlsx"):
212
+ if not sheet:
213
+ sheet = 1
214
+ if isinstance(headers, list):
215
+ kwargs["fill_merged_cells"] = True
216
+ elif "fill_merged_cells" not in kwargs:
217
+ kwargs["fill_merged_cells"] = False
216
218
  if sheet:
217
219
  kwargs["sheet"] = sheet
220
+ kwargs["headers"] = headers
218
221
  compression = datasetinfo.get("compression")
219
222
  if compression:
220
223
  kwargs["compression"] = compression
@@ -302,7 +305,7 @@ class Read(Retrieve):
302
305
  f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
303
306
  )
304
307
  datasets = []
305
- for file_path in glob.glob(f"{saved_path}_*.json"):
308
+ for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
306
309
  datasets.append(Dataset.load_from_json(file_path))
307
310
  else:
308
311
  datasets = Dataset.search_in_hdx(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: hdx-python-scraper
3
- Version: 2.3.8
3
+ Version: 2.4.0
4
4
  Summary: HDX Python scraper utilities to assemble data from multiple sources
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -27,8 +27,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Requires-Python: >=3.8
28
28
  Requires-Dist: gspread
29
29
  Requires-Dist: hdx-python-api>=6.3.1
30
- Requires-Dist: hdx-python-country>=3.7.2
31
- Requires-Dist: hdx-python-utilities>=3.7.0
30
+ Requires-Dist: hdx-python-country>=3.7.6
31
+ Requires-Dist: hdx-python-utilities>=3.7.2
32
32
  Requires-Dist: regex
33
33
  Provides-Extra: dev
34
34
  Requires-Dist: pre-commit; extra == 'dev'
@@ -1,11 +1,11 @@
1
1
  hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
- hdx/scraper/_version.py,sha256=F6Kqj0YPILNxwibWkU1tRl9QTGLZQ0ppBHxok1TGI5I,411
2
+ hdx/scraper/_version.py,sha256=NXpAHvzuYHxlLDJV0489874frLu4dA2joFw1iHLLrOg,411
3
3
  hdx/scraper/base_scraper.py,sha256=2eJifpb8G_KtEb9Z273suDCiMPteJsCBHwDEk3o0wA8,15433
4
4
  hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
5
5
  hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
7
7
  hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
8
- hdx/scraper/configurable/rowparser.py,sha256=GS2KYn3Q-r9OOd_PIs5ebA_30pSkwBOju4ZXiAEOqnU,14643
8
+ hdx/scraper/configurable/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
9
9
  hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
10
10
  hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
11
11
  hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
15
15
  hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
16
16
  hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
17
17
  hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
18
- hdx/scraper/utilities/reader.py,sha256=03S53U1GylPaeRoqEj3TT5UgiKTwVODUx3IETwCb9ps,26364
18
+ hdx/scraper/utilities/reader.py,sha256=A8GeMAie9swvydouBeD3hPi2YuH8liBsLJsEiqUqqfw,26500
19
19
  hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
20
20
  hdx/scraper/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
21
21
  hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
22
- hdx_python_scraper-2.3.8.dist-info/METADATA,sha256=4AHZxyFFH8srfK4eIRZ3FUhxp5zG-nBy5ucPaFnFQiQ,3361
23
- hdx_python_scraper-2.3.8.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
24
- hdx_python_scraper-2.3.8.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
- hdx_python_scraper-2.3.8.dist-info/RECORD,,
22
+ hdx_python_scraper-2.4.0.dist-info/METADATA,sha256=TZmLTpIowY1ta86DPlupiocA_uLH4Vda7uANNh9j38E,3361
23
+ hdx_python_scraper-2.4.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
24
+ hdx_python_scraper-2.4.0.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
+ hdx_python_scraper-2.4.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any