hdx-python-scraper 2.6.2__py3-none-any.whl → 2.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,13 @@
1
- # file generated by setuptools_scm
1
+ # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
+
4
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
5
+
3
6
  TYPE_CHECKING = False
4
7
  if TYPE_CHECKING:
5
- from typing import Tuple, Union
8
+ from typing import Tuple
9
+ from typing import Union
10
+
6
11
  VERSION_TUPLE = Tuple[Union[int, str], ...]
7
12
  else:
8
13
  VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
12
17
  __version_tuple__: VERSION_TUPLE
13
18
  version_tuple: VERSION_TUPLE
14
19
 
15
- __version__ = version = '2.6.2'
16
- __version_tuple__ = version_tuple = (2, 6, 2)
20
+ __version__ = version = '2.6.4'
21
+ __version_tuple__ = version_tuple = (2, 6, 4)
@@ -79,8 +79,7 @@ class BaseScraper(ABC):
79
79
  None
80
80
  """
81
81
  self.values: Dict[str, Tuple] = {
82
- level: tuple({} for _ in value[0])
83
- for level, value in self.headers.items()
82
+ level: tuple({} for _ in value[0]) for level, value in self.headers.items()
84
83
  }
85
84
  self.sources: Dict[str, List] = {level: [] for level in self.headers}
86
85
  self.source_configuration = deepcopy(source_configuration)
@@ -137,9 +136,7 @@ class BaseScraper(ABC):
137
136
  return
138
137
  if self.datasetinfo.get("no_sources", False):
139
138
  return
140
- should_overwrite_sources = self.datasetinfo.get(
141
- "should_overwrite_sources"
142
- )
139
+ should_overwrite_sources = self.datasetinfo.get("should_overwrite_sources")
143
140
  if should_overwrite_sources is not None:
144
141
  self.source_configuration["should_overwrite_sources"] = (
145
142
  should_overwrite_sources
@@ -209,16 +206,12 @@ class BaseScraper(ABC):
209
206
  )
210
207
 
211
208
  for i, hxltag in enumerate(self.headers[level][1]):
212
- suffix_attribute = self.source_configuration.get(
213
- "suffix_attribute"
214
- )
209
+ suffix_attribute = self.source_configuration.get("suffix_attribute")
215
210
  if suffix_attribute:
216
211
  add_source(hxltag, suffix_attribute)
217
212
  continue
218
213
  values = self.get_values(level)[i]
219
- admin_sources = self.source_configuration.get(
220
- "admin_sources", False
221
- )
214
+ admin_sources = self.source_configuration.get("admin_sources", False)
222
215
  if not admin_sources:
223
216
  raise ValueError("Invalid source configuration!")
224
217
  admin_mapping = self.source_configuration.get("admin_mapping")
@@ -260,9 +253,7 @@ class BaseScraper(ABC):
260
253
  """
261
254
  if datasetinfo is None:
262
255
  datasetinfo = self.datasetinfo
263
- date = Sources.get_hxltag_source_date(
264
- datasetinfo, hxltag, fallback=True
265
- )
256
+ date = Sources.get_hxltag_source_date(datasetinfo, hxltag, fallback=True)
266
257
  if key is None:
267
258
  key = self.name
268
259
  dict_of_lists_add(
@@ -80,9 +80,7 @@ class GoogleSheets(BaseOutput):
80
80
  headers = list(values.columns.values)
81
81
  rows = [headers]
82
82
  if hxltags:
83
- rows.append(
84
- [hxltags.get(header, "") for header in headers]
85
- )
83
+ rows.append([hxltags.get(header, "") for header in headers])
86
84
  if limit is not None:
87
85
  values = values.head(limit)
88
86
  df = values.copy(deep=True)
@@ -219,9 +219,7 @@ class JsonFile(BaseOutput):
219
219
  newjson = self.json.get(key)
220
220
  filters = tabdetails.get("filters", {})
221
221
  hxltags = tabdetails.get("output")
222
- if (filters or hxltags or remove) and isinstance(
223
- newjson, list
224
- ):
222
+ if (filters or hxltags or remove) and isinstance(newjson, list):
225
223
  rows = []
226
224
  for row in newjson:
227
225
  ignore_row = False
@@ -48,9 +48,7 @@ class Runner:
48
48
  self.scrapers = {}
49
49
  self.scraper_names = []
50
50
 
51
- def add_custom(
52
- self, scraper: BaseScraper, force_add_to_run: bool = False
53
- ) -> str:
51
+ def add_custom(self, scraper: BaseScraper, force_add_to_run: bool = False) -> str:
54
52
  """Add custom scrapers that inherit BaseScraper. If running specific scrapers
55
53
  rather than all, and you want to force the inclusion of the scraper in the run
56
54
  regardless of the specific scrapers given, the parameter force_add_to_run
@@ -285,9 +283,7 @@ class Runner:
285
283
  Returns:
286
284
  Optional["Aggregator"]: scraper or None
287
285
  """
288
- input_headers = self.get_headers(
289
- names, [input_level], overrides=overrides
290
- )
286
+ input_headers = self.get_headers(names, [input_level], overrides=overrides)
291
287
  input_headers = input_headers.get(input_level)
292
288
  if not input_headers:
293
289
  return None
@@ -312,9 +308,7 @@ class Runner:
312
308
  ) = self.get_values_sourcesinfo_by_header(
313
309
  input_level, names, overrides, True, use_hxl
314
310
  )
315
- scraper_self.set_input_values_sources(
316
- input_values, input_sourcesinfo
317
- )
311
+ scraper_self.set_input_values_sources(input_values, input_sourcesinfo)
318
312
 
319
313
  scraper.pre_run = lambda: get_values_sourcesinfo_by_header(scraper)
320
314
  return scraper
@@ -469,9 +463,7 @@ class Runner:
469
463
  keys = []
470
464
  for datasetinfo in configuration:
471
465
  keys.append(
472
- self.add_resource_downloader(
473
- datasetinfo, folder, force_add_to_run
474
- )
466
+ self.add_resource_downloader(datasetinfo, folder, force_add_to_run)
475
467
  )
476
468
  return keys
477
469
 
@@ -552,9 +544,7 @@ class Runner:
552
544
  for key, value in kwargs.items():
553
545
  setattr(scraper, key, value)
554
546
 
555
- def add_pre_run(
556
- self, name: str, fn: Callable[[BaseScraper], None]
557
- ) -> None:
547
+ def add_pre_run(self, name: str, fn: Callable[[BaseScraper], None]) -> None:
558
548
  """Add pre run instance method to scraper instance given scraper name. The
559
549
  function should have one parameter. Since it is being added as an instance
560
550
  method to the scraper instance, that parameter will be self and hence is of
@@ -570,9 +560,7 @@ class Runner:
570
560
  scraper = self.get_scraper_exception(name)
571
561
  scraper.pre_run = lambda: fn(scraper)
572
562
 
573
- def add_post_run(
574
- self, name: str, fn: Callable[[BaseScraper], None]
575
- ) -> None:
563
+ def add_post_run(self, name: str, fn: Callable[[BaseScraper], None]) -> None:
576
564
  """Add post run instance method to scraper instance given scraper name. The
577
565
  function should have one parameter. Since it is being added as an instance
578
566
  method to the scraper instance, that parameter will be self and hence is of
@@ -617,9 +605,7 @@ class Runner:
617
605
  f"Using fallbacks for {scraper.name}! Error: {format_exc()}"
618
606
  )
619
607
  for level in scraper.headers.keys():
620
- values, sources = Fallbacks.get(
621
- level, scraper.headers[level]
622
- )
608
+ values, sources = Fallbacks.get(level, scraper.headers[level])
623
609
  scraper.values[level] = values
624
610
  scraper.sources[level] = sources
625
611
  scraper.add_population()
@@ -643,9 +629,7 @@ class Runner:
643
629
  Returns:
644
630
  bool: Return True if scraper was run, False if not
645
631
  """
646
- if self.scrapers_to_run and not any(
647
- x in name for x in self.scrapers_to_run
648
- ):
632
+ if self.scrapers_to_run and not any(x in name for x in self.scrapers_to_run):
649
633
  return False
650
634
  logger.info(f"Running {name}")
651
635
  return self.run_one(name, force_run)
@@ -728,9 +712,7 @@ class Runner:
728
712
  names = self.scrapers.keys()
729
713
  results = {}
730
714
 
731
- def add_level_results(
732
- scraper_level, override_level, scrap, levels_used
733
- ):
715
+ def add_level_results(scraper_level, override_level, scrap, levels_used):
734
716
  nonlocal results
735
717
 
736
718
  if scraper_level in levels_used:
@@ -802,9 +784,7 @@ class Runner:
802
784
  names = self.scrapers.keys()
803
785
  results = {}
804
786
 
805
- def add_level_results(
806
- scraper_level, override_level, scrap, levels_used
807
- ):
787
+ def add_level_results(scraper_level, override_level, scrap, levels_used):
808
788
  nonlocal results
809
789
 
810
790
  if scraper_level in levels_used:
@@ -830,10 +810,8 @@ class Runner:
830
810
  lev_headings = level_results["headers"][0]
831
811
  lev_hxltags = level_results["headers"][1]
832
812
  lev_values = level_results["values"]
833
- scraper_should_overwrite_sources = (
834
- scraper.source_configuration.get(
835
- "should_overwrite_sources", should_overwrite_sources
836
- )
813
+ scraper_should_overwrite_sources = scraper.source_configuration.get(
814
+ "should_overwrite_sources", should_overwrite_sources
837
815
  )
838
816
  for i, hxltag in enumerate(hxltags):
839
817
  if hxltag in lev_hxltags:
@@ -903,9 +881,7 @@ class Runner:
903
881
  Returns:
904
882
  List[List]: Rows for a given level
905
883
  """
906
- results = self.get_results(names, [level], overrides=overrides).get(
907
- level
908
- )
884
+ results = self.get_results(names, [level], overrides=overrides).get(level)
909
885
  rows = []
910
886
  if results:
911
887
  all_headers = results["headers"]
@@ -956,9 +932,7 @@ class Runner:
956
932
  else:
957
933
  main_index = 0
958
934
 
959
- def add_level_results(
960
- scraper_level, override_level, scrap, levels_used
961
- ):
935
+ def add_level_results(scraper_level, override_level, scrap, levels_used):
962
936
  nonlocal values, sourcesinfo
963
937
 
964
938
  if scraper_level in levels_used:
@@ -1129,10 +1103,8 @@ class Runner:
1129
1103
  levels_to_check = levels
1130
1104
  else:
1131
1105
  levels_to_check = scraper.sources.keys()
1132
- scraper_should_overwrite_sources = (
1133
- scraper.source_configuration.get(
1134
- "should_overwrite_sources", should_overwrite_sources
1135
- )
1106
+ scraper_should_overwrite_sources = scraper.source_configuration.get(
1107
+ "should_overwrite_sources", should_overwrite_sources
1136
1108
  )
1137
1109
  for level in levels_to_check:
1138
1110
  Sources.add_sources_overwrite(
@@ -1145,9 +1117,7 @@ class Runner:
1145
1117
  add_additional_sources()
1146
1118
  return sources
1147
1119
 
1148
- def get_source_urls(
1149
- self, names: Optional[ListTuple[str]] = None
1150
- ) -> List[str]:
1120
+ def get_source_urls(self, names: Optional[ListTuple[str]] = None) -> List[str]:
1151
1121
  """Get source urls for scrapers limiting to those in names if given.
1152
1122
 
1153
1123
  Args:
@@ -1260,9 +1230,7 @@ class Runner:
1260
1230
  if not hapi_resource_metadata:
1261
1231
  return
1262
1232
  dataset_id = hapi_dataset_metadata["hdx_id"]
1263
- hapi_metadata = hapi_results.get(
1264
- dataset_id, copy(hapi_dataset_metadata)
1265
- )
1233
+ hapi_metadata = hapi_results.get(dataset_id, copy(hapi_dataset_metadata))
1266
1234
  results = hapi_metadata.get("results", {})
1267
1235
  level_results = results.get(scraper_level)
1268
1236
  if level_results is None:
@@ -115,9 +115,7 @@ class Aggregator(BaseScraper):
115
115
  config_headers_or_hxltags = datasetinfo.get("input")
116
116
  if config_headers_or_hxltags:
117
117
  exists = True
118
- for i, config_header_or_hxltag in enumerate(
119
- config_headers_or_hxltags
120
- ):
118
+ for i, config_header_or_hxltag in enumerate(config_headers_or_hxltags):
121
119
  try:
122
120
  input_headers[main_index].index(config_header_or_hxltag)
123
121
  except ValueError:
@@ -218,9 +216,7 @@ class Aggregator(BaseScraper):
218
216
  novals = 0
219
217
  for valuestr in valuelist:
220
218
  value = ""
221
- if isinstance(valuestr, int) or isinstance(
222
- valuestr, float
223
- ):
219
+ if isinstance(valuestr, int) or isinstance(valuestr, float):
224
220
  value = valuestr
225
221
  else:
226
222
  if valuestr:
@@ -359,9 +355,7 @@ class Aggregator(BaseScraper):
359
355
  if "source" not in self.datasetinfo:
360
356
  self.datasetinfo["source"] = ",".join(sourceinfo["source"])
361
357
  if "source_url" not in self.datasetinfo:
362
- self.datasetinfo["source_url"] = ",".join(
363
- sourceinfo["source_url"]
364
- )
358
+ self.datasetinfo["source_url"] = ",".join(sourceinfo["source_url"])
365
359
  if "source" not in self.datasetinfo:
366
360
  return
367
361
  super().add_sources()
@@ -129,9 +129,7 @@ class ConfigurableScraper(BaseScraper):
129
129
  "input_keep": datasetinfo.get("input_keep", []),
130
130
  "input_append": datasetinfo.get("input_append", []),
131
131
  "sum": datasetinfo.get("sum"),
132
- "input_ignore_vals": datasetinfo.get(
133
- "input_ignore_vals", []
134
- ),
132
+ "input_ignore_vals": datasetinfo.get("input_ignore_vals", []),
135
133
  "output": datasetinfo.get("output", []),
136
134
  "output_hxl": datasetinfo.get("output_hxl", []),
137
135
  }
@@ -144,10 +142,7 @@ class ConfigurableScraper(BaseScraper):
144
142
  Returns:
145
143
  Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
146
144
  """
147
- if (
148
- "filename" not in self.datasetinfo
149
- and "file_prefix" not in self.datasetinfo
150
- ):
145
+ if "filename" not in self.datasetinfo and "file_prefix" not in self.datasetinfo:
151
146
  self.datasetinfo["file_prefix"] = self.name
152
147
  return self.get_reader().read(self.datasetinfo, **self.variables)
153
148
 
@@ -162,9 +157,7 @@ class ConfigurableScraper(BaseScraper):
162
157
  if not date or use_date_from_date_col:
163
158
  date = self.rowparser.get_maxdate()
164
159
  if date == 0:
165
- raise ValueError(
166
- "No date given in datasetinfo or as a column!"
167
- )
160
+ raise ValueError("No date given in datasetinfo or as a column!")
168
161
  if self.rowparser.datetype == "date":
169
162
  if not isinstance(date, datetime):
170
163
  date = parse_date(date)
@@ -361,11 +354,7 @@ class ConfigurableScraper(BaseScraper):
361
354
  else:
362
355
  input_keep_index = -1
363
356
  val = valdicts[j][adm][input_keep_index]
364
- if (
365
- val is None
366
- or val == ""
367
- or val in input_ignore_vals
368
- ):
357
+ if val is None or val == "" or val in input_ignore_vals:
369
358
  val = 0
370
359
  else:
371
360
  hasvalues = True
@@ -384,16 +373,12 @@ class ConfigurableScraper(BaseScraper):
384
373
  for bracketed_str in matches.captures("rec"):
385
374
  if any(bracketed_str in x for x in valcols):
386
375
  continue
387
- _, hasvalues_t = text_replacement(
388
- bracketed_str, adm
389
- )
376
+ _, hasvalues_t = text_replacement(bracketed_str, adm)
390
377
  if not hasvalues_t:
391
378
  hasvalues = False
392
379
  break
393
380
  if hasvalues:
394
- formula, hasvalues_t = text_replacement(
395
- process_col, adm
396
- )
381
+ formula, hasvalues_t = text_replacement(process_col, adm)
397
382
  if hasvalues_t:
398
383
  formula = formula.replace(
399
384
  "#population",
@@ -431,20 +416,14 @@ class ConfigurableScraper(BaseScraper):
431
416
  continue
432
417
  for j, valdict in enumerate(valdicts):
433
418
  val = valdict[adm][i]
434
- if (
435
- val is None
436
- or val == ""
437
- or val in input_ignore_vals
438
- ):
419
+ if val is None or val == "" or val in input_ignore_vals:
439
420
  continue
440
421
  newvaldicts[j][adm] = eval(
441
422
  f"newvaldicts[j].get(adm, 0.0) + {str(valdict[adm][i])}"
442
423
  )
443
424
  formula = formula.replace("#population", "#pzbgvjh")
444
425
  for i in sorted_len_indices:
445
- formula = formula.replace(
446
- valcols[i], f"newvaldicts[{i}][adm]"
447
- )
426
+ formula = formula.replace(valcols[i], f"newvaldicts[{i}][adm]")
448
427
  formula = formula.replace("#pzbgvjh", population_str)
449
428
  for adm in valdicts[0]:
450
429
  try:
@@ -470,14 +449,10 @@ class ConfigurableScraper(BaseScraper):
470
449
  header_to_hxltag = self.use_hxl(None, file_headers, iterator)
471
450
  if "source_url" not in self.datasetinfo:
472
451
  self.datasetinfo["source_url"] = self.datasetinfo["url"]
473
- source_date = Sources.standardise_datasetinfo_source_date(
474
- self.datasetinfo
475
- )
452
+ source_date = Sources.standardise_datasetinfo_source_date(self.datasetinfo)
476
453
  if not source_date or self.datasetinfo.get("force_date_today", False):
477
454
  source_date = self.today
478
- self.datasetinfo["source_date"] = {
479
- "default_date": {"end": source_date}
480
- }
455
+ self.datasetinfo["source_date"] = {"default_date": {"end": source_date}}
481
456
  self.rowparser = RowParser(
482
457
  self.name,
483
458
  self.countryiso3s,
@@ -115,9 +115,7 @@ class RowParser:
115
115
  self.maxdates = {i: date for i, _ in enumerate(subsets)}
116
116
  else:
117
117
  if self.datelevel > len(self.admcols):
118
- raise ValueError(
119
- "No admin columns specified for required level_type!"
120
- )
118
+ raise ValueError("No admin columns specified for required level_type!")
121
119
  self.maxdates = {
122
120
  i: {adm: date for adm in self.adms[self.datelevel]}
123
121
  for i, _ in enumerate(subsets)
@@ -150,9 +148,7 @@ class RowParser:
150
148
  header = hxltag.display_tag
151
149
  else:
152
150
  header = hxltag.header
153
- dict_of_lists_add(
154
- self.filters, header, row.get("#country+code")
155
- )
151
+ dict_of_lists_add(self.filters, header, row.get("#country+code"))
156
152
 
157
153
  def get_filter_str_for_eval(self, filter: str) -> str:
158
154
  """Replace filter string variables with columns in row of data
@@ -229,9 +225,7 @@ class RowParser:
229
225
  newrow[self.header_to_hxltag[header]] = row[header]
230
226
  yield newrow
231
227
 
232
- def stop_rows(
233
- self, iterator: Iterator[Dict]
234
- ) -> Generator[Dict, None, None]:
228
+ def stop_rows(self, iterator: Iterator[Dict]) -> Generator[Dict, None, None]:
235
229
  """Stop processing rows after condition met
236
230
 
237
231
  Args:
@@ -46,9 +46,7 @@ class TimeSeries(BaseScraper):
46
46
  datetype = self.datasetinfo["date_type"]
47
47
  ignore_future_date = self.datasetinfo.get("ignore_future_date", True)
48
48
  headers = [datecol] + self.datasetinfo["output"]
49
- hxltags = [self.datasetinfo["date_hxl"]] + self.datasetinfo[
50
- "output_hxl"
51
- ]
49
+ hxltags = [self.datasetinfo["date_hxl"]] + self.datasetinfo["output_hxl"]
52
50
  rows = [headers, hxltags]
53
51
  file_headers, iterator = self.get_reader().read(
54
52
  self.datasetinfo, file_prefix=self.name
@@ -31,8 +31,20 @@ def complete_admins(
31
31
  warnings = []
32
32
  child = None
33
33
  adm_level = len(provider_adm_names)
34
+
35
+ def check_unknown_pcode(adm_code: str, pcode: str) -> str:
36
+ if pcode:
37
+ warnings.append(f"PCode unknown {adm_code}->{pcode} ({warntxt})")
38
+ return pcode
39
+ else:
40
+ warnings.append(f"PCode unknown {adm_code}->''")
41
+ return ""
42
+
34
43
  for i, provider_adm_name in reversed(list(enumerate(provider_adm_names))):
35
44
  adm_code = adm_codes[i]
45
+ parent = admins[i].pcode_to_parent.get(adm_code)
46
+ if not parent and i > 0:
47
+ parent = adm_codes[i - 1]
36
48
  if not provider_adm_name:
37
49
  provider_adm_name = ""
38
50
  provider_adm_names[i] = ""
@@ -40,9 +52,6 @@ def complete_admins(
40
52
  pcode = admins[i + 1].pcode_to_parent.get(child)
41
53
  warntxt = "parent"
42
54
  elif provider_adm_name:
43
- parent = admins[i].pcode_to_parent.get(adm_code)
44
- if not parent and i > 0:
45
- parent = adm_codes[i - 1]
46
55
  pcode, _ = admins[i].get_pcode(
47
56
  countryiso3,
48
57
  provider_adm_name,
@@ -54,24 +63,23 @@ def complete_admins(
54
63
  pcode = None
55
64
  if adm_code:
56
65
  if adm_code not in admins[i].pcodes:
57
- if pcode:
58
- warnings.append(
59
- f"PCode unknown {adm_code}->{pcode} ({warntxt})"
66
+ if admins[i].looks_like_pcode(adm_code):
67
+ adj_adm_code = admins[i].convert_admin_pcode_length(
68
+ countryiso3, adm_code, parent=parent
60
69
  )
61
- adm_code = pcode
70
+ if adj_adm_code:
71
+ warnings.append(f"PCode length {adm_code}->{adj_adm_code}")
72
+ adm_code = adj_adm_code
73
+ else:
74
+ adm_code = check_unknown_pcode(adm_code, pcode)
62
75
  else:
63
- warnings.append(f"PCode unknown {adm_code}->''")
64
- adm_code = ""
76
+ adm_code = check_unknown_pcode(adm_code, pcode)
65
77
  elif pcode and adm_code != pcode:
66
78
  if child:
67
- warnings.append(
68
- f"PCode mismatch {adm_code}->{pcode} ({warntxt})"
69
- )
79
+ warnings.append(f"PCode mismatch {adm_code}->{pcode} ({warntxt})")
70
80
  adm_code = pcode
71
81
  else:
72
- warnings.append(
73
- f"PCode mismatch {adm_code} != {provider_adm_name}"
74
- )
82
+ warnings.append(f"PCode mismatch {adm_code} != {provider_adm_name}")
75
83
  elif pcode:
76
84
  adm_code = pcode
77
85
  else:
@@ -21,9 +21,7 @@ class Lookup:
21
21
  """
22
22
 
23
23
  def __init__(self, yaml_config_path: str, classobject: Type):
24
- configuration = load_yaml(
25
- script_dir_plus_file(yaml_config_path, classobject)
26
- )
24
+ configuration = load_yaml(script_dir_plus_file(yaml_config_path, classobject))
27
25
  self._configuration = configuration
28
26
  initial_lookup = configuration.get("initial_lookup", {})
29
27
  self._code_lookup = copy(initial_lookup)
@@ -91,9 +89,7 @@ class Lookup:
91
89
  unmatched=self._unmatched,
92
90
  )
93
91
 
94
- def get_name(
95
- self, code: str, default: Optional[str] = None
96
- ) -> Optional[str]:
92
+ def get_name(self, code: str, default: Optional[str] = None) -> Optional[str]:
97
93
  """Get name from code
98
94
 
99
95
  Args:
@@ -194,9 +194,7 @@ class Read(Retrieve):
194
194
  today=self.today,
195
195
  )
196
196
 
197
- def setup_tabular(
198
- self, datasetinfo: Dict, kwargs: Dict
199
- ) -> Union[str, List]:
197
+ def setup_tabular(self, datasetinfo: Dict, kwargs: Dict) -> Union[str, List]:
200
198
  """Setup kwargs for tabular source eg. csv, xls, xlsx from
201
199
  datasetinfo and return url.
202
200
 
@@ -330,9 +328,7 @@ class Read(Retrieve):
330
328
  for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
331
329
  datasets.append(Dataset.load_from_json(file_path))
332
330
  else:
333
- datasets = Dataset.search_in_hdx(
334
- query, configuration, page_size, **kwargs
335
- )
331
+ datasets = Dataset.search_in_hdx(query, configuration, page_size, **kwargs)
336
332
  if self.save:
337
333
  for i, dataset in enumerate(datasets):
338
334
  file_path = f"{saved_path}_{i}.json"
@@ -382,9 +378,7 @@ class Read(Retrieve):
382
378
  path = self.download_file(url, **kwargs)
383
379
  return url, path
384
380
 
385
- def download_resource(
386
- self, resource: Resource, **kwargs: Any
387
- ) -> Tuple[str, str]:
381
+ def download_resource(self, resource: Resource, **kwargs: Any) -> Tuple[str, str]:
388
382
  """Download HDX resource os a file and return the url downloaded and
389
383
  the path of the file. The filename of the file comes from the name and
390
384
  format.
@@ -471,9 +465,7 @@ class Read(Retrieve):
471
465
  data.display_tags
472
466
  return data
473
467
  except hxl.HXLException:
474
- logger.warning(
475
- f"Could not process {url}. Maybe there are no HXL tags?"
476
- )
468
+ logger.warning(f"Could not process {url}. Maybe there are no HXL tags?")
477
469
  return None
478
470
  except Exception:
479
471
  logger.exception(f"Error reading {url}!")
@@ -495,14 +487,10 @@ class Read(Retrieve):
495
487
  Optional[Dict]: Information about file or None
496
488
  """
497
489
  try:
498
- _, path = self.construct_filename_and_download(
499
- name, format, url, **kwargs
500
- )
490
+ _, path = self.construct_filename_and_download(name, format, url, **kwargs)
501
491
  return hxl.info(path, InputOptions(allow_local=True))
502
492
  except hxl.HXLException:
503
- logger.warning(
504
- f"Could not process {url}. Maybe there are no HXL tags?"
505
- )
493
+ logger.warning(f"Could not process {url}. Maybe there are no HXL tags?")
506
494
  return None
507
495
  except Exception:
508
496
  logger.exception(f"Error reading {url}!")
@@ -586,23 +574,21 @@ class Read(Retrieve):
586
574
  else:
587
575
  url = resource["url"] # otherwise set the url key in
588
576
  # datasetinfo to the resource url (by setting url here)
589
- datasetinfo["hapi_resource_metadata"] = (
590
- self.get_hapi_resource_metadata(resource)
577
+ datasetinfo["hapi_resource_metadata"] = self.get_hapi_resource_metadata(
578
+ resource
591
579
  )
592
580
  datasetinfo["url"] = url
593
581
  if "source_date" not in datasetinfo:
594
- datasetinfo["source_date"] = (
595
- get_startend_dates_from_time_period(
596
- dataset, today=self.today
597
- )
582
+ datasetinfo["source_date"] = get_startend_dates_from_time_period(
583
+ dataset, today=self.today
598
584
  )
599
585
  if "source" not in datasetinfo:
600
586
  datasetinfo["source"] = dataset["dataset_source"]
601
587
  if "source_url" not in datasetinfo:
602
588
  datasetinfo["source_url"] = dataset.get_hdx_url()
603
589
  Sources.standardise_datasetinfo_source_date(datasetinfo)
604
- datasetinfo["hapi_dataset_metadata"] = (
605
- self.get_hapi_dataset_metadata(dataset, datasetinfo)
590
+ datasetinfo["hapi_dataset_metadata"] = self.get_hapi_dataset_metadata(
591
+ dataset, datasetinfo
606
592
  )
607
593
  return resource
608
594
 
@@ -669,18 +655,14 @@ class Read(Retrieve):
669
655
  Returns:
670
656
  Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
671
657
  """
672
- resource = self.read_hdx_metadata(
673
- datasetinfo, configuration=configuration
674
- )
658
+ resource = self.read_hdx_metadata(datasetinfo, configuration=configuration)
675
659
  filename = kwargs.get("filename")
676
660
  if filename:
677
661
  del kwargs["filename"]
678
662
  datasetinfo["filename"] = filename
679
663
  filename = datasetinfo.get("filename")
680
664
  if resource and not filename:
681
- filename = self.construct_filename(
682
- resource["name"], resource.get_format()
683
- )
665
+ filename = self.construct_filename(resource["name"], resource.get_format())
684
666
  file_prefix = kwargs.get("file_prefix")
685
667
  if not file_prefix:
686
668
  file_prefix = datasetinfo.get("file_prefix")
@@ -708,13 +690,9 @@ class Read(Retrieve):
708
690
  format = datasetinfo["format"]
709
691
  if format in ["json", "csv", "xls", "xlsx"]:
710
692
  if "dataset" in datasetinfo:
711
- headers, iterator = self.read_hdx(
712
- datasetinfo, configuration, **kwargs
713
- )
693
+ headers, iterator = self.read_hdx(datasetinfo, configuration, **kwargs)
714
694
  else:
715
695
  headers, iterator = self.read_tabular(datasetinfo, **kwargs)
716
696
  else:
717
- raise ValueError(
718
- f"Invalid format {format} for {datasetinfo['name']}!"
719
- )
697
+ raise ValueError(f"Invalid format {format} for {datasetinfo['name']}!")
720
698
  return headers, iterator
@@ -91,9 +91,7 @@ class Sources:
91
91
  else:
92
92
  if isinstance(value, dict):
93
93
  for startend, date in value.items():
94
- set_source_date(
95
- date, hxltag=key, startend=startend
96
- )
94
+ set_source_date(date, hxltag=key, startend=startend)
97
95
  else:
98
96
  set_source_date(value, hxltag=key)
99
97
  else:
@@ -214,9 +212,7 @@ class Sources:
214
212
  index = hxltags.index(hxltag)
215
213
  sources[index] = source
216
214
  else:
217
- logger.warning(
218
- f"Keeping existing source information for {hxltag}!"
219
- )
215
+ logger.warning(f"Keeping existing source information for {hxltag}!")
220
216
  else:
221
217
  hxltags.append(hxltag)
222
218
  sources.append(source)
@@ -282,9 +278,7 @@ class Sources:
282
278
  if no_sources:
283
279
  source_configuration["no_sources"] = True
284
280
  return source_configuration
285
- source_configuration["should_overwrite_sources"] = (
286
- should_overwrite_sources
287
- )
281
+ source_configuration["should_overwrite_sources"] = should_overwrite_sources
288
282
  if suffix_attribute:
289
283
  source_configuration["suffix_attribute"] = suffix_attribute
290
284
  return source_configuration
@@ -309,9 +309,7 @@ class Writer:
309
309
 
310
310
  fns.append(region_fn)
311
311
 
312
- rows = self.runner.get_rows(
313
- level, countries, headers, fns, names=names
314
- )
312
+ rows = self.runner.get_rows(level, countries, headers, fns, names=names)
315
313
  if rows:
316
314
  self.update(tab, rows)
317
315
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hdx-python-scraper
3
- Version: 2.6.2
3
+ Version: 2.6.4
4
4
  Summary: HDX Python scraper utilities to assemble data from multiple sources
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -26,15 +26,18 @@ Classifier: Programming Language :: Python :: 3.12
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Requires-Python: >=3.8
28
28
  Requires-Dist: gspread
29
- Requires-Dist: hdx-python-api>=6.3.8
30
- Requires-Dist: hdx-python-country>=3.8.8
31
- Requires-Dist: hdx-python-utilities>=3.8.3
29
+ Requires-Dist: hdx-python-api>=6.3.9
30
+ Requires-Dist: hdx-python-country>=3.9.2
31
+ Requires-Dist: hdx-python-utilities>=3.8.6
32
32
  Requires-Dist: regex
33
33
  Provides-Extra: dev
34
34
  Requires-Dist: pre-commit; extra == 'dev'
35
+ Provides-Extra: docs
36
+ Requires-Dist: mkapi; extra == 'docs'
35
37
  Provides-Extra: pandas
36
- Requires-Dist: pandas>=2.2.2; extra == 'pandas'
38
+ Requires-Dist: pandas>=2.2.3; extra == 'pandas'
37
39
  Provides-Extra: test
40
+ Requires-Dist: pandas>=2.2.3; extra == 'test'
38
41
  Requires-Dist: pytest; extra == 'test'
39
42
  Requires-Dist: pytest-cov; extra == 'test'
40
43
  Description-Content-Type: text/markdown
@@ -0,0 +1,31 @@
1
+ hdx/scraper/framework/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
+ hdx/scraper/framework/_version.py,sha256=a5nalDjLY2yvq7ieXFfR076fN3sJh2mCxFSXqRSIcE0,511
3
+ hdx/scraper/framework/base_scraper.py,sha256=bv9FguvOD40nulgC16zmOsxyg3iAPUDn_zM1V-MVvSY,15292
4
+ hdx/scraper/framework/runner.py,sha256=M6YqiZvOvCewlGn2E0ksslkK7ZHRiWGnRVwQjus805c,53087
5
+ hdx/scraper/framework/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ hdx/scraper/framework/outputs/base.py,sha256=VASnjmw8yM_-c0-G5Ku2gDTuQiYcFxkE27i7jWJfg4c,2563
7
+ hdx/scraper/framework/outputs/excelfile.py,sha256=dkyaI3nKUTn_tpVTbb7NB8F3sKzZQ-7U8l825EdzQ18,2196
8
+ hdx/scraper/framework/outputs/googlesheets.py,sha256=--mri4hhWslfshcVExlobnHgt87aaAtGrmzW2RAk4Ic,3040
9
+ hdx/scraper/framework/outputs/json.py,sha256=NPOMfrG0brIPf3B7NENi-6LdCDbso-K-nPMAVMVa7JU,9455
10
+ hdx/scraper/framework/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ hdx/scraper/framework/scrapers/aggregator.py,sha256=pPKu8QR0_GWRhpSqPBA6bd0KKq-WgN6AEtv9_8gj9d8,14858
12
+ hdx/scraper/framework/scrapers/configurable_scraper.py,sha256=OW_Y5ESVb91hWMtpvZQAqXxP-VyPt9Af5IGvZk0xSuE,19994
13
+ hdx/scraper/framework/scrapers/resource_downloader.py,sha256=ZuSc5L4X4LWcWKniHS5BDmMuM97H8kWCzB1H_PNceGc,1548
14
+ hdx/scraper/framework/scrapers/rowparser.py,sha256=_xImgr6gXXfLRcLu1xEcXmSr6OCCsctkJXfKuzkw97w,15515
15
+ hdx/scraper/framework/scrapers/timeseries.py,sha256=FYk5-MoOnvAa6ym5UWNUWHLpKmJNgHBk1La_nHap18c,3004
16
+ hdx/scraper/framework/utilities/__init__.py,sha256=-zOJzat-fbv427FBIKDnWLs2QStXTBZahiNy_-pgPOc,2390
17
+ hdx/scraper/framework/utilities/fallbacks.py,sha256=t8oKE3_3I6fX4-kzvAdRIhdjg-9vWBGE6shd2_EvC4c,6184
18
+ hdx/scraper/framework/utilities/hapi_admins.py,sha256=XuZXAnkooCLg4tSKZfPqY4bK0rzBALejPxac7IFdwW4,4093
19
+ hdx/scraper/framework/utilities/lookup.py,sha256=WnZa3lY4matfAIsr-GnxurmYndBVbtzbcM9Twm7-4Ho,3483
20
+ hdx/scraper/framework/utilities/org_type.py,sha256=euQyRV01yA8kJ3nMFvZxnTRLnvCuxgV1ZZQx8gEOB8Y,183
21
+ hdx/scraper/framework/utilities/org_type_configuration.yaml,sha256=tTordLPgnE90FSJzbVJPEnE06KyhlQBsPlIu1IAw3iw,1841
22
+ hdx/scraper/framework/utilities/reader.py,sha256=VYi92sAxqZD0nFn9q8OSEEcxXtELNB6DNCF838ES0x4,27415
23
+ hdx/scraper/framework/utilities/region_lookup.py,sha256=82tl1A2GLcxhiTqd1etTpxE5T6anbM-9dHih2ZlN00o,3916
24
+ hdx/scraper/framework/utilities/sector.py,sha256=XGysivvPhTqQfK6z1y96sDJATk3zx7sS_qGqCa4PbaI,177
25
+ hdx/scraper/framework/utilities/sector_configuration.yaml,sha256=VKddsahminPOc3QKKieb1DvaYXkdPdhT5cPAL9_HjDw,4940
26
+ hdx/scraper/framework/utilities/sources.py,sha256=0aW0IbH8nsViDixjD-fIh3gO86vwklYkPU8cXxmJkz4,11379
27
+ hdx/scraper/framework/utilities/writer.py,sha256=yJQ_HcJj-l6DJW5Fl8nr1f3wLIJjogxmF22IU4ysj4c,16673
28
+ hdx_python_scraper-2.6.4.dist-info/METADATA,sha256=fnetPHzz1YtmMbyuOzA97cYlh2GBNowGWiii_adRU-k,3466
29
+ hdx_python_scraper-2.6.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
30
+ hdx_python_scraper-2.6.4.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
31
+ hdx_python_scraper-2.6.4.dist-info/RECORD,,
@@ -1,31 +0,0 @@
1
- hdx/scraper/framework/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
- hdx/scraper/framework/_version.py,sha256=U2b7313v-bM1h69WtyleJ6hXm9RX-9buCsEK4Qgb1fg,411
3
- hdx/scraper/framework/base_scraper.py,sha256=J7AHhDFBehENragRvpZnV8Qi7IcfFql9U_UU1svNr5o,15424
4
- hdx/scraper/framework/runner.py,sha256=an0c_tz46PCnyyUk0dgDV8xfkb7F3LHMUIhPyPSU8sM,53499
5
- hdx/scraper/framework/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- hdx/scraper/framework/outputs/base.py,sha256=VASnjmw8yM_-c0-G5Ku2gDTuQiYcFxkE27i7jWJfg4c,2563
7
- hdx/scraper/framework/outputs/excelfile.py,sha256=dkyaI3nKUTn_tpVTbb7NB8F3sKzZQ-7U8l825EdzQ18,2196
8
- hdx/scraper/framework/outputs/googlesheets.py,sha256=gw9VM2UM3D6N7saUWaXiU2H_ihZn40b6J227I7t-SQs,3086
9
- hdx/scraper/framework/outputs/json.py,sha256=nINV-P5gTGCrtq-zEWYT_Si4ggqhJvUWHCQHxTHUh64,9493
10
- hdx/scraper/framework/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- hdx/scraper/framework/scrapers/aggregator.py,sha256=zC6kmk6hC7xZ-j72R75VEppJKk6kIF5RE18KYBoavX0,14972
12
- hdx/scraper/framework/scrapers/configurable_scraper.py,sha256=Q9AeThIHZwWQetCejrWKOK3RH1Fh1RqXhbU4qDbq2d4,20608
13
- hdx/scraper/framework/scrapers/resource_downloader.py,sha256=ZuSc5L4X4LWcWKniHS5BDmMuM97H8kWCzB1H_PNceGc,1548
14
- hdx/scraper/framework/scrapers/rowparser.py,sha256=j_FxNYzhSB9QmBAzqNhuNVCjlJP4cJKS1l4qLzXG5qE,15613
15
- hdx/scraper/framework/scrapers/timeseries.py,sha256=w0Ejg4nXaLSuq32zx7vlULMZuDZaTqRO94Dm-VKA9co,3026
16
- hdx/scraper/framework/utilities/__init__.py,sha256=-zOJzat-fbv427FBIKDnWLs2QStXTBZahiNy_-pgPOc,2390
17
- hdx/scraper/framework/utilities/fallbacks.py,sha256=t8oKE3_3I6fX4-kzvAdRIhdjg-9vWBGE6shd2_EvC4c,6184
18
- hdx/scraper/framework/utilities/hapi_admins.py,sha256=k_VZtDuSDL3h_0RwZqu9x_fckRnvfQ62KPv5Q-IMVEo,3664
19
- hdx/scraper/framework/utilities/lookup.py,sha256=4IkO35eBN5xz2H5y688C3L80zEvTjrHQgYiM4dFVpPo,3519
20
- hdx/scraper/framework/utilities/org_type.py,sha256=euQyRV01yA8kJ3nMFvZxnTRLnvCuxgV1ZZQx8gEOB8Y,183
21
- hdx/scraper/framework/utilities/org_type_configuration.yaml,sha256=tTordLPgnE90FSJzbVJPEnE06KyhlQBsPlIu1IAw3iw,1841
22
- hdx/scraper/framework/utilities/reader.py,sha256=wM878LqmsuDfsbIGmDmrESRfSfRhJS2POF-dnQ1Ix58,27735
23
- hdx/scraper/framework/utilities/region_lookup.py,sha256=82tl1A2GLcxhiTqd1etTpxE5T6anbM-9dHih2ZlN00o,3916
24
- hdx/scraper/framework/utilities/sector.py,sha256=XGysivvPhTqQfK6z1y96sDJATk3zx7sS_qGqCa4PbaI,177
25
- hdx/scraper/framework/utilities/sector_configuration.yaml,sha256=VKddsahminPOc3QKKieb1DvaYXkdPdhT5cPAL9_HjDw,4940
26
- hdx/scraper/framework/utilities/sources.py,sha256=3miKn_iruWpfpBA-7R9jFt6_EdfX1zvW4PvjifOCd7s,11503
27
- hdx/scraper/framework/utilities/writer.py,sha256=YjOhVo3Ks0I5WH7oyM2Q7fO6ImGabYZ2CBhbYw_A0Kk,16695
28
- hdx_python_scraper-2.6.2.dist-info/METADATA,sha256=KXHMz2OhUVSO1K02K7_bAdY91Nt0sagGcYcIbjr8iKU,3361
29
- hdx_python_scraper-2.6.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
30
- hdx_python_scraper-2.6.2.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
31
- hdx_python_scraper-2.6.2.dist-info/RECORD,,