hdx-python-scraper 2.3.7__py3-none-any.whl → 2.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdx/scraper/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2.3.7'
16
- __version_tuple__ = version_tuple = (2, 3, 7)
15
+ __version__ = version = '2.3.9'
16
+ __version_tuple__ = version_tuple = (2, 3, 9)
@@ -185,20 +185,14 @@ class RowParser:
185
185
  Returns:
186
186
  Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
187
187
  """
188
- rows = []
189
- for row in iterator:
190
- if self.header_to_hxltag:
191
- newrow = {}
192
- for header in row:
193
- newrow[self.header_to_hxltag[header]] = row[header]
194
- row = newrow
195
- if self.stop_row:
196
- if all(
197
- row[key] == value for key, value in self.stop_row.items()
198
- ):
199
- break
200
- for newrow in self.flatten(row):
201
- rows.append(newrow)
188
+ if self.header_to_hxltag:
189
+ iterator = self.header_to_hxltag_rows(iterator)
190
+ if self.stop_row:
191
+ iterator = self.stop_rows(iterator)
192
+ if self.flatteninfo:
193
+ iterator = self.flatten_rows(iterator)
194
+ if self.prefilter:
195
+ iterator = (row for row in iterator if eval(self.prefilter))
202
196
  if not self.sort:
203
197
  if self.datecol:
204
198
  for subset in self.subsets:
@@ -212,15 +206,59 @@ class RowParser:
212
206
  )
213
207
  self.sort = {"keys": [self.datecol], "reverse": True}
214
208
  break
215
- if self.prefilter:
216
- rows = [row for row in rows if eval(self.prefilter)]
217
209
  if self.sort:
218
210
  keys = self.sort["keys"]
219
211
  reverse = self.sort.get("reverse", False)
220
- rows = sorted(rows, key=itemgetter(*keys), reverse=reverse)
221
- return rows
212
+ iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
213
+ return iterator
214
+
215
+ def header_to_hxltag_rows(
216
+ self, iterator: Iterator[Dict]
217
+ ) -> Generator[Dict, None, None]:
218
+ """Convert headers to HXL tags in keys
219
+
220
+ Args:
221
+ iterator (Iterator[Dict]): Input data
222
+
223
+ Returns:
224
+ Generator[Dict]: Rows where keys are HXL tags
225
+ """
226
+ for row in iterator:
227
+ newrow = {}
228
+ for header in row:
229
+ newrow[self.header_to_hxltag[header]] = row[header]
230
+ yield newrow
231
+
232
+ def stop_rows(
233
+ self, iterator: Iterator[Dict]
234
+ ) -> Generator[Dict, None, None]:
235
+ """Stop processing rows after condition met
236
+
237
+ Args:
238
+ iterator (Iterator[Dict]): Input data
239
+
240
+ Returns:
241
+ Generator[Dict]: Rows up to stop condition
242
+ """
243
+ for row in iterator:
244
+ if all(row[key] == value for key, value in self.stop_row.items()):
245
+ break
246
+ yield row
247
+
248
+ def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
249
+ """Flatten rows
250
+
251
+ Args:
252
+ iterator (Iterator[Dict]): Input data
253
+
254
+ Returns:
255
+ Generator[Dict]: Flattened rows
256
+ """
257
+ for row in iterator:
258
+ for newrow in self.flatten_row(row):
259
+ yield newrow
222
260
 
223
- def flatten(self, row: Dict) -> Generator[Dict, None, None]:
261
+ def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
224
262
  """Flatten a wide spreadsheet format into a long one
225
263
 
226
264
  Args:
@@ -229,9 +267,6 @@ class RowParser:
229
267
  Returns:
230
268
  Generator[Dict]: Flattened row(s)
231
269
  """
232
- if not self.flatteninfo:
233
- yield row
234
- return
235
270
  counters = [-1 for _ in self.flatteninfo]
236
271
  while True:
237
272
  newrow = copy.deepcopy(row)
@@ -314,7 +349,7 @@ class RowParser:
314
349
  adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
315
350
  elif i == 1:
316
351
  adms[i], exact = self.adminlevel.get_pcode(
317
- adms[0], adm, self.name
352
+ adms[0], adm, logname=self.name
318
353
  )
319
354
  if adms[i] not in self.adms[i]:
320
355
  adms[i] = None
@@ -302,7 +302,7 @@ class Read(Retrieve):
302
302
  f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
303
303
  )
304
304
  datasets = []
305
- for file_path in glob.glob(f"{saved_path}_*.json"):
305
+ for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
306
306
  datasets.append(Dataset.load_from_json(file_path))
307
307
  else:
308
308
  datasets = Dataset.search_in_hdx(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: hdx-python-scraper
3
- Version: 2.3.7
3
+ Version: 2.3.9
4
4
  Summary: HDX Python scraper utilities to assemble data from multiple sources
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -26,14 +26,14 @@ Classifier: Programming Language :: Python :: 3.12
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Requires-Python: >=3.8
28
28
  Requires-Dist: gspread
29
- Requires-Dist: hdx-python-api>=6.2.8
30
- Requires-Dist: hdx-python-country>=3.7.0
31
- Requires-Dist: hdx-python-utilities>=3.6.8
29
+ Requires-Dist: hdx-python-api>=6.3.1
30
+ Requires-Dist: hdx-python-country>=3.7.6
31
+ Requires-Dist: hdx-python-utilities>=3.7.2
32
32
  Requires-Dist: regex
33
33
  Provides-Extra: dev
34
34
  Requires-Dist: pre-commit; extra == 'dev'
35
35
  Provides-Extra: pandas
36
- Requires-Dist: pandas>=2.1.3; extra == 'pandas'
36
+ Requires-Dist: pandas>=2.2.2; extra == 'pandas'
37
37
  Provides-Extra: test
38
38
  Requires-Dist: pytest; extra == 'test'
39
39
  Requires-Dist: pytest-cov; extra == 'test'
@@ -1,11 +1,11 @@
1
1
  hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
- hdx/scraper/_version.py,sha256=SH_yCAX65tCK8PRP8gyPvUcp4HPVksM4fKEz1rXjzjM,411
2
+ hdx/scraper/_version.py,sha256=lMqlPiUx9lKEMHVUT48lKzJtwunAe5HDUG5UDVJpmdc,411
3
3
  hdx/scraper/base_scraper.py,sha256=2eJifpb8G_KtEb9Z273suDCiMPteJsCBHwDEk3o0wA8,15433
4
4
  hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
5
5
  hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
7
7
  hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
8
- hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
8
+ hdx/scraper/configurable/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
9
9
  hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
10
10
  hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
11
11
  hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
15
15
  hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
16
16
  hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
17
17
  hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
18
- hdx/scraper/utilities/reader.py,sha256=03S53U1GylPaeRoqEj3TT5UgiKTwVODUx3IETwCb9ps,26364
18
+ hdx/scraper/utilities/reader.py,sha256=TIkQ9UGlOVvtUx0JRF83PrrgWNk5IwPfrzTqlITuGLM,26372
19
19
  hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
20
20
  hdx/scraper/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
21
21
  hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
22
- hdx_python_scraper-2.3.7.dist-info/METADATA,sha256=Nw-xgPumG7UzJw3M1D5G9kZeUgZObM3m8mkkA1kutqg,3361
23
- hdx_python_scraper-2.3.7.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
24
- hdx_python_scraper-2.3.7.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
- hdx_python_scraper-2.3.7.dist-info/RECORD,,
22
+ hdx_python_scraper-2.3.9.dist-info/METADATA,sha256=CQgUoTo83s7fqbcvG1XwDm0T8v0U_S3m6Me3bHuXElE,3361
23
+ hdx_python_scraper-2.3.9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
24
+ hdx_python_scraper-2.3.9.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
+ hdx_python_scraper-2.3.9.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any