hdx-python-scraper 2.3.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdx/scraper/{__init__.py → framework/__init__.py} +0 -0
- hdx/scraper/{_version.py → framework/_version.py} +2 -2
- hdx/scraper/{base_scraper.py → framework/base_scraper.py} +4 -4
- hdx/scraper/{outputs → framework/outputs}/googlesheets.py +1 -1
- hdx/scraper/{runner.py → framework/runner.py} +35 -13
- hdx/scraper/{configurable/scraper.py → framework/scrapers/configurable_scraper.py} +5 -5
- hdx/scraper/{configurable → framework/scrapers}/rowparser.py +58 -23
- hdx/scraper/{utilities → framework/utilities}/reader.py +93 -22
- hdx/scraper/framework/utilities/sector.py +63 -0
- hdx/scraper/framework/utilities/sector_configuration.yaml +138 -0
- hdx/scraper/{utilities → framework/utilities}/sources.py +3 -3
- {hdx_python_scraper-2.3.4.dist-info → hdx_python_scraper-2.5.2.dist-info}/METADATA +6 -5
- hdx_python_scraper-2.5.2.dist-info/RECORD +27 -0
- {hdx_python_scraper-2.3.4.dist-info → hdx_python_scraper-2.5.2.dist-info}/WHEEL +1 -1
- hdx_python_scraper-2.3.4.dist-info/RECORD +0 -25
- /hdx/scraper/{configurable → framework/outputs}/__init__.py +0 -0
- /hdx/scraper/{outputs → framework/outputs}/base.py +0 -0
- /hdx/scraper/{outputs → framework/outputs}/excelfile.py +0 -0
- /hdx/scraper/{outputs → framework/outputs}/json.py +0 -0
- /hdx/scraper/{outputs → framework/scrapers}/__init__.py +0 -0
- /hdx/scraper/{configurable → framework/scrapers}/aggregator.py +0 -0
- /hdx/scraper/{configurable → framework/scrapers}/resource_downloader.py +0 -0
- /hdx/scraper/{configurable → framework/scrapers}/timeseries.py +0 -0
- /hdx/scraper/{utilities → framework/utilities}/__init__.py +0 -0
- /hdx/scraper/{utilities → framework/utilities}/fallbacks.py +0 -0
- /hdx/scraper/{utilities → framework/utilities}/region_lookup.py +0 -0
- /hdx/scraper/{utilities → framework/utilities}/writer.py +0 -0
- {hdx_python_scraper-2.3.4.dist-info → hdx_python_scraper-2.5.2.dist-info}/licenses/LICENSE +0 -0
|
File without changes
|
|
@@ -36,7 +36,7 @@ class BaseScraper(ABC):
|
|
|
36
36
|
self.reader = datasetinfo.get("reader", name)
|
|
37
37
|
self.setup(headers, source_configuration)
|
|
38
38
|
self.datasetinfo = deepcopy(datasetinfo)
|
|
39
|
-
self.
|
|
39
|
+
self.error_handler = None
|
|
40
40
|
self.can_fallback = True
|
|
41
41
|
|
|
42
42
|
def setup(
|
|
@@ -141,9 +141,9 @@ class BaseScraper(ABC):
|
|
|
141
141
|
"should_overwrite_sources"
|
|
142
142
|
)
|
|
143
143
|
if should_overwrite_sources is not None:
|
|
144
|
-
self.source_configuration[
|
|
145
|
-
|
|
146
|
-
|
|
144
|
+
self.source_configuration["should_overwrite_sources"] = (
|
|
145
|
+
should_overwrite_sources
|
|
146
|
+
)
|
|
147
147
|
source = self.datasetinfo["source"]
|
|
148
148
|
if isinstance(source, str):
|
|
149
149
|
source = {"default_source": source}
|
|
@@ -5,18 +5,18 @@ from traceback import format_exc
|
|
|
5
5
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
from .base_scraper import BaseScraper
|
|
8
|
-
from .configurable.aggregator import Aggregator
|
|
9
|
-
from .configurable.resource_downloader import ResourceDownloader
|
|
10
|
-
from .configurable.scraper import ConfigurableScraper
|
|
11
|
-
from .configurable.timeseries import TimeSeries
|
|
12
8
|
from .outputs.base import BaseOutput
|
|
9
|
+
from .scrapers.aggregator import Aggregator
|
|
10
|
+
from .scrapers.configurable_scraper import ConfigurableScraper
|
|
11
|
+
from .scrapers.resource_downloader import ResourceDownloader
|
|
12
|
+
from .scrapers.timeseries import TimeSeries
|
|
13
13
|
from .utilities import get_startend_dates_from_time_period
|
|
14
14
|
from .utilities.fallbacks import Fallbacks
|
|
15
15
|
from .utilities.reader import Read
|
|
16
16
|
from .utilities.sources import Sources
|
|
17
17
|
from hdx.location.adminlevel import AdminLevel
|
|
18
18
|
from hdx.utilities.dateparse import now_utc
|
|
19
|
-
from hdx.utilities.
|
|
19
|
+
from hdx.utilities.error_handler import ErrorHandler
|
|
20
20
|
from hdx.utilities.typehint import ListTuple
|
|
21
21
|
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
@@ -28,7 +28,7 @@ class Runner:
|
|
|
28
28
|
Args:
|
|
29
29
|
countryiso3s (ListTuple[str]): List of ISO3 country codes to process
|
|
30
30
|
today (datetime): Value to use for today. Defaults to now_utc().
|
|
31
|
-
|
|
31
|
+
error_handler (ErrorHandler): ErrorHandler object that logs errors on exit
|
|
32
32
|
scrapers_to_run (Optional[ListTuple[str]]): Scrapers to run. Defaults to None (all scrapers).
|
|
33
33
|
"""
|
|
34
34
|
|
|
@@ -36,12 +36,12 @@ class Runner:
|
|
|
36
36
|
self,
|
|
37
37
|
countryiso3s: ListTuple[str],
|
|
38
38
|
today: datetime = now_utc(),
|
|
39
|
-
|
|
39
|
+
error_handler: Optional[ErrorHandler] = None,
|
|
40
40
|
scrapers_to_run: Optional[ListTuple[str]] = None,
|
|
41
41
|
):
|
|
42
42
|
self.countryiso3s = countryiso3s
|
|
43
43
|
self.today = today
|
|
44
|
-
self.
|
|
44
|
+
self.error_handler = error_handler
|
|
45
45
|
if isinstance(scrapers_to_run, tuple):
|
|
46
46
|
scrapers_to_run = list(scrapers_to_run)
|
|
47
47
|
self.scrapers_to_run: Optional[List[str]] = scrapers_to_run
|
|
@@ -73,7 +73,7 @@ class Runner:
|
|
|
73
73
|
and scraper_name not in self.scrapers_to_run
|
|
74
74
|
):
|
|
75
75
|
self.scrapers_to_run.append(scraper_name)
|
|
76
|
-
scraper.
|
|
76
|
+
scraper.error_handler = self.error_handler
|
|
77
77
|
return scraper_name
|
|
78
78
|
|
|
79
79
|
def add_customs(
|
|
@@ -106,6 +106,7 @@ class Runner:
|
|
|
106
106
|
source_configuration: Dict = {},
|
|
107
107
|
suffix: Optional[str] = None,
|
|
108
108
|
force_add_to_run: bool = False,
|
|
109
|
+
countryiso3s: Optional[List[str]] = None,
|
|
109
110
|
) -> str:
|
|
110
111
|
"""Add configurable scraper to the run. If running specific scrapers rather than
|
|
111
112
|
all, and you want to force the inclusion of the scraper in the run regardless of
|
|
@@ -121,6 +122,7 @@ class Runner:
|
|
|
121
122
|
source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
|
|
122
123
|
suffix (Optional[str]): Suffix to add to the scraper name
|
|
123
124
|
force_add_to_run (bool): Whether to force include the scraper in the next run
|
|
125
|
+
countryiso3s (Optional[List[str]]): Override list of country iso3s. Defaults to None.
|
|
124
126
|
|
|
125
127
|
Returns:
|
|
126
128
|
str: scraper name (including suffix if set)
|
|
@@ -129,16 +131,18 @@ class Runner:
|
|
|
129
131
|
scraper_name = f"{name}{suffix}"
|
|
130
132
|
else:
|
|
131
133
|
scraper_name = name
|
|
134
|
+
if not countryiso3s:
|
|
135
|
+
countryiso3s = self.countryiso3s
|
|
132
136
|
self.scrapers[scraper_name] = ConfigurableScraper(
|
|
133
137
|
name,
|
|
134
138
|
datasetinfo,
|
|
135
139
|
level,
|
|
136
|
-
|
|
140
|
+
countryiso3s,
|
|
137
141
|
adminlevel,
|
|
138
142
|
level_name,
|
|
139
143
|
source_configuration,
|
|
140
144
|
self.today,
|
|
141
|
-
self.
|
|
145
|
+
self.error_handler,
|
|
142
146
|
)
|
|
143
147
|
if scraper_name not in self.scraper_names:
|
|
144
148
|
self.scraper_names.append(scraper_name)
|
|
@@ -159,6 +163,7 @@ class Runner:
|
|
|
159
163
|
source_configuration: Dict = {},
|
|
160
164
|
suffix: Optional[str] = None,
|
|
161
165
|
force_add_to_run: bool = False,
|
|
166
|
+
countryiso3s: Optional[List[str]] = None,
|
|
162
167
|
) -> List[str]:
|
|
163
168
|
"""Add multiple configurable scrapers to the run. If running specific scrapers
|
|
164
169
|
rather than all, and you want to force the inclusion of the scraper in the run
|
|
@@ -173,6 +178,7 @@ class Runner:
|
|
|
173
178
|
source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
|
|
174
179
|
suffix (Optional[str]): Suffix to add to the scraper name
|
|
175
180
|
force_add_to_run (bool): Whether to force include the scraper in the next run
|
|
181
|
+
countryiso3s (Optional[List[str]]): Override list of country iso3s. Defaults to None.
|
|
176
182
|
|
|
177
183
|
Returns:
|
|
178
184
|
List[str]: scraper names (including suffix if set)
|
|
@@ -190,6 +196,7 @@ class Runner:
|
|
|
190
196
|
source_configuration,
|
|
191
197
|
suffix,
|
|
192
198
|
force_add_to_run,
|
|
199
|
+
countryiso3s,
|
|
193
200
|
)
|
|
194
201
|
)
|
|
195
202
|
return keys
|
|
@@ -516,6 +523,21 @@ class Runner:
|
|
|
516
523
|
raise ValueError(f"No such scraper {name}!")
|
|
517
524
|
return scraper
|
|
518
525
|
|
|
526
|
+
def delete_scraper(self, name: str) -> bool:
    """Remove the scraper registered under the given name.

    The name is dropped from self.scraper_names and the matching entry is
    deleted from self.scrapers. Nothing is changed if the name is unknown.

    Args:
        name (str): Name of scraper

    Returns:
        bool: True if the scraper was present, False if not
    """
    registered = name in self.scraper_names
    if registered:
        self.scraper_names.remove(name)
        del self.scrapers[name]
    return registered
|
|
540
|
+
|
|
519
541
|
def add_instance_variables(self, name: str, **kwargs: Any) -> None:
|
|
520
542
|
"""Add instance variables to scraper instance given scraper name
|
|
521
543
|
|
|
@@ -590,8 +612,8 @@ class Runner:
|
|
|
590
612
|
if not Fallbacks.exist() or scraper.can_fallback is False:
|
|
591
613
|
raise
|
|
592
614
|
logger.exception(f"Using fallbacks for {scraper.name}!")
|
|
593
|
-
if self.
|
|
594
|
-
self.
|
|
615
|
+
if self.error_handler:
|
|
616
|
+
self.error_handler.add(
|
|
595
617
|
f"Using fallbacks for {scraper.name}! Error: {format_exc()}"
|
|
596
618
|
)
|
|
597
619
|
for level in scraper.headers.keys():
|
|
@@ -17,7 +17,7 @@ from hdx.utilities.dateparse import (
|
|
|
17
17
|
)
|
|
18
18
|
from hdx.utilities.dictandlist import dict_of_lists_add
|
|
19
19
|
from hdx.utilities.downloader import DownloadError
|
|
20
|
-
from hdx.utilities.
|
|
20
|
+
from hdx.utilities.error_handler import ErrorHandler
|
|
21
21
|
from hdx.utilities.text import ( # noqa: F401
|
|
22
22
|
get_fraction_str,
|
|
23
23
|
get_numeric_if_possible,
|
|
@@ -42,7 +42,7 @@ class ConfigurableScraper(BaseScraper):
|
|
|
42
42
|
level_name (Optional[str]): Customised level_name name. Defaults to None (level).
|
|
43
43
|
source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
|
|
44
44
|
today (datetime): Value to use for today. Defaults to now_utc().
|
|
45
|
-
|
|
45
|
+
error_handler (Optional[ErrorHandler]): ErrorHandler object that logs errors on exit
|
|
46
46
|
**kwargs: Variables to use when evaluating template arguments in urls
|
|
47
47
|
"""
|
|
48
48
|
|
|
@@ -67,7 +67,7 @@ class ConfigurableScraper(BaseScraper):
|
|
|
67
67
|
level_name: Optional[str] = None,
|
|
68
68
|
source_configuration: Dict = {},
|
|
69
69
|
today: datetime = now_utc(),
|
|
70
|
-
|
|
70
|
+
error_handler: Optional[ErrorHandler] = None,
|
|
71
71
|
**kwargs: Any,
|
|
72
72
|
):
|
|
73
73
|
self.name = name
|
|
@@ -83,10 +83,10 @@ class ConfigurableScraper(BaseScraper):
|
|
|
83
83
|
else:
|
|
84
84
|
self.level_name: str = level_name
|
|
85
85
|
self.countryiso3s = countryiso3s
|
|
86
|
-
self.adminlevel = adminlevel
|
|
86
|
+
self.adminlevel: Optional[AdminLevel] = adminlevel
|
|
87
87
|
self.today = today
|
|
88
88
|
self.subsets = self.get_subsets_from_datasetinfo(datasetinfo)
|
|
89
|
-
self.
|
|
89
|
+
self.error_handler: Optional[ErrorHandler] = error_handler
|
|
90
90
|
self.variables = kwargs
|
|
91
91
|
self.rowparser = None
|
|
92
92
|
self.datasetinfo = copy.deepcopy(datasetinfo)
|
|
@@ -185,20 +185,14 @@ class RowParser:
|
|
|
185
185
|
Returns:
|
|
186
186
|
Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
|
|
187
187
|
"""
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
if self.
|
|
196
|
-
if all(
|
|
197
|
-
row[key] == value for key, value in self.stop_row.items()
|
|
198
|
-
):
|
|
199
|
-
break
|
|
200
|
-
for newrow in self.flatten(row):
|
|
201
|
-
rows.append(newrow)
|
|
188
|
+
if self.header_to_hxltag:
|
|
189
|
+
iterator = self.header_to_hxltag_rows(iterator)
|
|
190
|
+
if self.stop_row:
|
|
191
|
+
iterator = self.stop_rows(iterator)
|
|
192
|
+
if self.flatteninfo:
|
|
193
|
+
iterator = self.flatten_rows(iterator)
|
|
194
|
+
if self.prefilter:
|
|
195
|
+
iterator = (row for row in iterator if eval(self.prefilter))
|
|
202
196
|
if not self.sort:
|
|
203
197
|
if self.datecol:
|
|
204
198
|
for subset in self.subsets:
|
|
@@ -212,15 +206,59 @@ class RowParser:
|
|
|
212
206
|
)
|
|
213
207
|
self.sort = {"keys": [self.datecol], "reverse": True}
|
|
214
208
|
break
|
|
215
|
-
if self.prefilter:
|
|
216
|
-
rows = [row for row in rows if eval(self.prefilter)]
|
|
217
209
|
if self.sort:
|
|
218
210
|
keys = self.sort["keys"]
|
|
219
211
|
reverse = self.sort.get("reverse", False)
|
|
220
|
-
|
|
221
|
-
return
|
|
212
|
+
iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
|
|
213
|
+
return iterator
|
|
214
|
+
|
|
215
|
+
def header_to_hxltag_rows(
    self, iterator: Iterator[Dict]
) -> Generator[Dict, None, None]:
    """Re-key every row by HXL tag instead of by header.

    Each header is looked up in self.header_to_hxltag; a header with no
    entry there raises KeyError.

    Args:
        iterator (Iterator[Dict]): Input data

    Returns:
        Generator[Dict]: Rows where keys are HXL tags
    """
    for row in iterator:
        yield {
            self.header_to_hxltag[header]: value
            for header, value in row.items()
        }
|
|
231
|
+
|
|
232
|
+
def stop_rows(
    self, iterator: Iterator[Dict]
) -> Generator[Dict, None, None]:
    """Pass rows through until one matches the stop condition.

    A row matches when, for every key/value pair in self.stop_row, the row
    holds that value under that key. The matching row itself is not
    yielded and nothing after it is read.

    Args:
        iterator (Iterator[Dict]): Input data

    Returns:
        Generator[Dict]: Rows up to stop condition
    """
    for row in iterator:
        reached_stop = all(
            row[key] == value for key, value in self.stop_row.items()
        )
        if reached_stop:
            return
        yield row
|
|
247
|
+
|
|
248
|
+
def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
    """Expand every input row into its flattened row(s).

    Each row is handed to self.flatten_row and whatever that produces is
    yielded in order.

    Args:
        iterator (Iterator[Dict]): Input data

    Returns:
        Generator[Dict]: Flattened rows
    """
    for wide_row in iterator:
        yield from self.flatten_row(wide_row)
|
|
222
260
|
|
|
223
|
-
def
|
|
261
|
+
def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
|
|
224
262
|
"""Flatten a wide spreadsheet format into a long one
|
|
225
263
|
|
|
226
264
|
Args:
|
|
@@ -229,9 +267,6 @@ class RowParser:
|
|
|
229
267
|
Returns:
|
|
230
268
|
Generator[Dict]: Flattened row(s)
|
|
231
269
|
"""
|
|
232
|
-
if not self.flatteninfo:
|
|
233
|
-
yield row
|
|
234
|
-
return
|
|
235
270
|
counters = [-1 for _ in self.flatteninfo]
|
|
236
271
|
while True:
|
|
237
272
|
newrow = copy.deepcopy(row)
|
|
@@ -314,7 +349,7 @@ class RowParser:
|
|
|
314
349
|
adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
|
|
315
350
|
elif i == 1:
|
|
316
351
|
adms[i], exact = self.adminlevel.get_pcode(
|
|
317
|
-
adms[0], adm, self.name
|
|
352
|
+
adms[0], adm, logname=self.name
|
|
318
353
|
)
|
|
319
354
|
if adms[i] not in self.adms[i]:
|
|
320
355
|
adms[i] = None
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import glob
|
|
1
2
|
import logging
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
from os.path import join
|
|
@@ -10,6 +11,7 @@ from slugify import slugify
|
|
|
10
11
|
|
|
11
12
|
from . import get_startend_dates_from_time_period, match_template
|
|
12
13
|
from .sources import Sources
|
|
14
|
+
from hdx.api.configuration import Configuration
|
|
13
15
|
from hdx.data.dataset import Dataset
|
|
14
16
|
from hdx.data.resource import Resource
|
|
15
17
|
from hdx.utilities.dateparse import parse_date
|
|
@@ -204,15 +206,19 @@ class Read(Retrieve):
|
|
|
204
206
|
if headers is None:
|
|
205
207
|
headers = 1
|
|
206
208
|
datasetinfo["headers"] = 1
|
|
207
|
-
kwargs["headers"] = headers
|
|
208
|
-
if isinstance(headers, list):
|
|
209
|
-
kwargs["fill_merged_cells"] = True
|
|
210
209
|
format = datasetinfo["format"]
|
|
211
210
|
kwargs["format"] = format
|
|
212
|
-
if
|
|
213
|
-
|
|
211
|
+
if format in ("xls", "xlsx"):
|
|
212
|
+
if not sheet:
|
|
213
|
+
sheet = 1
|
|
214
|
+
if isinstance(headers, list):
|
|
215
|
+
kwargs["fill_merged_cells"] = True
|
|
216
|
+
elif "fill_merged_cells" not in kwargs:
|
|
217
|
+
kwargs["fill_merged_cells"] = False
|
|
218
|
+
kwargs["xlsx2csv"] = datasetinfo.get("xlsx2csv", False)
|
|
214
219
|
if sheet:
|
|
215
220
|
kwargs["sheet"] = sheet
|
|
221
|
+
kwargs["headers"] = headers
|
|
216
222
|
compression = datasetinfo.get("compression")
|
|
217
223
|
if compression:
|
|
218
224
|
kwargs["compression"] = compression
|
|
@@ -238,11 +244,14 @@ class Read(Retrieve):
|
|
|
238
244
|
**kwargs,
|
|
239
245
|
)
|
|
240
246
|
|
|
241
|
-
def read_dataset(
|
|
247
|
+
def read_dataset(
|
|
248
|
+
self, dataset_name: str, configuration: Optional[Configuration] = None
|
|
249
|
+
) -> Optional[Dataset]:
|
|
242
250
|
"""Read HDX dataset
|
|
243
251
|
|
|
244
252
|
Args:
|
|
245
253
|
dataset_name (str): Dataset name
|
|
254
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
246
255
|
|
|
247
256
|
Returns:
|
|
248
257
|
Optional[Dataset]: The dataset that was read or None
|
|
@@ -252,7 +261,7 @@ class Read(Retrieve):
|
|
|
252
261
|
logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
|
|
253
262
|
dataset = Dataset.load_from_json(saved_path)
|
|
254
263
|
else:
|
|
255
|
-
dataset = Dataset.read_from_hdx(dataset_name)
|
|
264
|
+
dataset = Dataset.read_from_hdx(dataset_name, configuration)
|
|
256
265
|
if self.save:
|
|
257
266
|
logger.info(f"Saving dataset {dataset_name} in {saved_path}")
|
|
258
267
|
if dataset is None:
|
|
@@ -261,6 +270,56 @@ class Read(Retrieve):
|
|
|
261
270
|
dataset.save_to_json(saved_path, follow_urls=True)
|
|
262
271
|
return dataset
|
|
263
272
|
|
|
273
|
+
def search_datasets(
    self,
    filename: str,
    query: Optional[str] = "*:*",
    configuration: Optional[Configuration] = None,
    page_size: int = 1000,
    **kwargs: Any,
) -> List[Dataset]:
    """Search HDX for datasets, or reload the results of an earlier search.

    When self.use_saved is set, datasets are loaded from the files
    <filename>_<n>.json in self.saved_dir, in the numeric order of n (the
    order in which they were saved). Otherwise HDX is searched and, when
    self.save is set, each result is written to <filename>_<n>.json where n
    is its position in the search results.

    Args:
        filename (str): Filename for saved files. Will be prefixed by underscore and a number.
        query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
        configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
        page_size (int): Size of page to return. Defaults to 1000.
        **kwargs: See below
        fq (string): Any filter queries to apply
        rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
        start (int): Offset in the complete result for where the set of returned datasets should begin
        sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
        facet (string): Whether to enable faceted results. Default to True.
        facet.mincount (int): Minimum counts for facet fields should be included in the results
        facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
        facet.field (List[str]): Fields to facet upon. Default is empty.
        use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.

    Returns:
        List[Dataset]: list of datasets resulting from query
    """

    saved_path = join(self.saved_dir, filename)
    if self.use_saved:
        logger.info(
            f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
        )

        def saved_order(file_path):
            # Files are written below as <saved_path>_<i>.json. Recover i and
            # sort on it numerically: plain string sorting would load _10
            # before _2 and so change the order of the saved search results.
            index = file_path[: -len(".json")].rsplit("_", 1)[-1]
            if index.isdecimal():
                return 0, int(index), file_path
            # Anything else the pattern happens to match goes last, by path
            return 1, 0, file_path

        file_paths = sorted(glob.glob(f"{saved_path}_*.json"), key=saved_order)
        datasets = [Dataset.load_from_json(file_path) for file_path in file_paths]
    else:
        datasets = Dataset.search_in_hdx(
            query, configuration, page_size, **kwargs
        )
        if self.save:
            for i, dataset in enumerate(datasets):
                file_path = f"{saved_path}_{i}.json"
                name = dataset["name"]
                logger.info(f"Saving dataset {name} in {file_path}")
                dataset.save_to_json(file_path, follow_urls=True)
    return datasets
|
|
322
|
+
|
|
264
323
|
@staticmethod
|
|
265
324
|
def construct_filename(name: str, format: str):
|
|
266
325
|
"""Construct filename from name and format. The filename of the file
|
|
@@ -438,7 +497,10 @@ class Read(Retrieve):
|
|
|
438
497
|
return self.hxl_info_file(name, format, url, **kwargs)
|
|
439
498
|
|
|
440
499
|
def read_hdx_metadata(
|
|
441
|
-
self,
|
|
500
|
+
self,
|
|
501
|
+
datasetinfo: Dict,
|
|
502
|
+
do_resource_check: bool = True,
|
|
503
|
+
configuration: Optional[Configuration] = None,
|
|
442
504
|
) -> Optional[Resource]:
|
|
443
505
|
"""Read metadata from HDX dataset and add to input dictionary. If url
|
|
444
506
|
is not supplied, will look through resources for one that matches
|
|
@@ -454,13 +516,14 @@ class Read(Retrieve):
|
|
|
454
516
|
Args:
|
|
455
517
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
456
518
|
do_resource_check (bool): Whether to check resources. Defaults to True.
|
|
519
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
457
520
|
|
|
458
521
|
Returns:
|
|
459
522
|
Optional[Resource]: The resource if a url was not given
|
|
460
523
|
"""
|
|
461
524
|
dataset_nameinfo = datasetinfo["dataset"]
|
|
462
525
|
if isinstance(dataset_nameinfo, str):
|
|
463
|
-
dataset = self.read_dataset(dataset_nameinfo)
|
|
526
|
+
dataset = self.read_dataset(dataset_nameinfo, configuration)
|
|
464
527
|
resource = None
|
|
465
528
|
url = datasetinfo.get("url")
|
|
466
529
|
resource_name = datasetinfo.get("resource")
|
|
@@ -491,24 +554,24 @@ class Read(Retrieve):
|
|
|
491
554
|
else:
|
|
492
555
|
url = resource["url"] # otherwise set the url key in
|
|
493
556
|
# datasetinfo to the resource url (by setting url here)
|
|
494
|
-
datasetinfo[
|
|
495
|
-
|
|
496
|
-
|
|
557
|
+
datasetinfo["hapi_resource_metadata"] = (
|
|
558
|
+
self.get_hapi_resource_metadata(resource)
|
|
559
|
+
)
|
|
497
560
|
datasetinfo["url"] = url
|
|
498
561
|
if "source_date" not in datasetinfo:
|
|
499
|
-
datasetinfo[
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
562
|
+
datasetinfo["source_date"] = (
|
|
563
|
+
get_startend_dates_from_time_period(
|
|
564
|
+
dataset, today=self.today
|
|
565
|
+
)
|
|
503
566
|
)
|
|
504
567
|
if "source" not in datasetinfo:
|
|
505
568
|
datasetinfo["source"] = dataset["dataset_source"]
|
|
506
569
|
if "source_url" not in datasetinfo:
|
|
507
570
|
datasetinfo["source_url"] = dataset.get_hdx_url()
|
|
508
571
|
Sources.standardise_datasetinfo_source_date(datasetinfo)
|
|
509
|
-
datasetinfo[
|
|
510
|
-
|
|
511
|
-
|
|
572
|
+
datasetinfo["hapi_dataset_metadata"] = (
|
|
573
|
+
self.get_hapi_dataset_metadata(dataset, datasetinfo)
|
|
574
|
+
)
|
|
512
575
|
return resource
|
|
513
576
|
|
|
514
577
|
if "source_date" not in datasetinfo:
|
|
@@ -527,7 +590,7 @@ class Read(Retrieve):
|
|
|
527
590
|
for hxltag, dataset_name in dataset_nameinfo.items():
|
|
528
591
|
dataset = datasets.get(dataset_name)
|
|
529
592
|
if not dataset:
|
|
530
|
-
dataset = self.read_dataset(dataset_name)
|
|
593
|
+
dataset = self.read_dataset(dataset_name, configuration)
|
|
531
594
|
datasets[dataset_name] = dataset
|
|
532
595
|
if source_date is not None:
|
|
533
596
|
if hxltag == "default_dataset":
|
|
@@ -561,18 +624,22 @@ class Read(Retrieve):
|
|
|
561
624
|
def read_hdx(
|
|
562
625
|
self,
|
|
563
626
|
datasetinfo: Dict,
|
|
627
|
+
configuration: Optional[Configuration] = None,
|
|
564
628
|
**kwargs: Any,
|
|
565
629
|
) -> Tuple[List[str], Iterator[Dict]]:
|
|
566
630
|
"""Read data and metadata from HDX dataset
|
|
567
631
|
|
|
568
632
|
Args:
|
|
569
633
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
634
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
570
635
|
**kwargs: Parameters to pass to download_file call
|
|
571
636
|
|
|
572
637
|
Returns:
|
|
573
638
|
Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
|
|
574
639
|
"""
|
|
575
|
-
resource = self.read_hdx_metadata(
|
|
640
|
+
resource = self.read_hdx_metadata(
|
|
641
|
+
datasetinfo, configuration=configuration
|
|
642
|
+
)
|
|
576
643
|
filename = kwargs.get("filename")
|
|
577
644
|
if filename:
|
|
578
645
|
del kwargs["filename"]
|
|
@@ -593,12 +660,14 @@ class Read(Retrieve):
|
|
|
593
660
|
def read(
|
|
594
661
|
self,
|
|
595
662
|
datasetinfo: Dict,
|
|
663
|
+
configuration: Optional[Configuration] = None,
|
|
596
664
|
**kwargs: Any,
|
|
597
665
|
) -> Tuple[List[str], Iterator[Dict]]:
|
|
598
666
|
"""Read data and metadata from HDX dataset
|
|
599
667
|
|
|
600
668
|
Args:
|
|
601
669
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
670
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
602
671
|
**kwargs: Parameters to pass to download_file call
|
|
603
672
|
|
|
604
673
|
Returns:
|
|
@@ -607,7 +676,9 @@ class Read(Retrieve):
|
|
|
607
676
|
format = datasetinfo["format"]
|
|
608
677
|
if format in ["json", "csv", "xls", "xlsx"]:
|
|
609
678
|
if "dataset" in datasetinfo:
|
|
610
|
-
headers, iterator = self.read_hdx(
|
|
679
|
+
headers, iterator = self.read_hdx(
|
|
680
|
+
datasetinfo, configuration, **kwargs
|
|
681
|
+
)
|
|
611
682
|
else:
|
|
612
683
|
headers, iterator = self.read_tabular(datasetinfo, **kwargs)
|
|
613
684
|
else:
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Populate the sector mapping."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from copy import copy
|
|
5
|
+
from typing import Dict, Optional
|
|
6
|
+
|
|
7
|
+
from .reader import Read
|
|
8
|
+
from hdx.utilities.loader import load_yaml
|
|
9
|
+
from hdx.utilities.matching import get_code_from_name
|
|
10
|
+
from hdx.utilities.path import script_dir_plus_file
|
|
11
|
+
from hdx.utilities.text import normalise
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Sector:
    """Lookup from sector names and codes to sector codes.

    The lookup (self.data) starts as a copy of the "sector_map" section of
    the configuration and is then extended by populate() with entries read
    from the dataset described in the "sector" section.
    """

    def __init__(
        self,
        configuration: Optional[Dict] = None,
    ):
        """Build the sector lookup.

        Args:
            configuration (Optional[Dict]): Dictionary with "sector" (dataset
                information passed to the reader) and "sector_map" (initial
                name to code mapping) keys. Defaults to None, in which case
                sector_configuration.yaml next to this module is loaded.
        """
        if configuration is None:
            configuration = load_yaml(
                script_dir_plus_file("sector_configuration.yaml", Sector)
            )
        self._datasetinfo = configuration["sector"]
        # Shallow copy so that populate() does not add keys to the caller's
        # (or the loaded) configuration dictionary
        self.data = copy(configuration["sector_map"])
        # Passed to get_code_from_name on every lookup - presumably it
        # collects names that could not be matched; confirm in
        # hdx-python-utilities
        self.unmatched = []
        self.populate()

    def populate(self) -> None:
        """Add sector codes and names from the sector dataset to self.data.

        Every code/name pair is registered under four keys: the name, the
        code, and the normalise()d form of each. A fixed set of extra
        code/name pairs is registered the same way afterwards.
        """
        logger.info("Populating sector mapping")

        def parse_sector_values(code: str, name: str):
            self.data[name] = code
            self.data[code] = code
            self.data[normalise(name)] = code
            self.data[normalise(code)] = code

        reader = Read.get_reader()
        # Only the row iterator is needed; the returned headers are unused
        _, iterator = reader.read(self._datasetinfo, file_prefix="sector")
        for row in iterator:
            parse_sector_values(
                code=row["#sector +code +acronym"],
                name=row["#sector +name +preferred +i_en"],
            )

        # Registered in addition to the rows read from the sector dataset
        extra_entries = {
            "Cash": "Cash programming",
            "Hum": "Humanitarian assistance (unspecified)",
            "Multi": "Multi-sector (unspecified)",
            "Intersectoral": "Intersectoral",
        }
        for code, name in extra_entries.items():
            parse_sector_values(code=code, name=name)

    # Optional[str] rather than "str | None": the annotation is evaluated when
    # the class body runs and the "|" form raises TypeError before Python 3.10
    def get_sector_code(self, sector: str) -> Optional[str]:
        """Look up the sector code for a sector name.

        Delegates to get_code_from_name with self.data as the lookup and
        self.unmatched as the unmatched list.

        Args:
            sector (str): Sector name (or code) to look up

        Returns:
            Optional[str]: Sector code, or None (presumably when no match is
            found - confirm against get_code_from_name)
        """
        return get_code_from_name(
            name=sector,
            code_lookup=self.data,
            unmatched=self.unmatched,
        )
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
sector:
|
|
2
|
+
dataset: "global-coordination-groups-beta"
|
|
3
|
+
resource: "Global Coordination Groups (Beta) CSV"
|
|
4
|
+
format: "csv"
|
|
5
|
+
headers: 2
|
|
6
|
+
|
|
7
|
+
sector_map:
|
|
8
|
+
abna: "SHL"
|
|
9
|
+
abri: "SHL"
|
|
10
|
+
abri bna: "SHL"
|
|
11
|
+
abris: "SHL"
|
|
12
|
+
abris ame: "SHL"
|
|
13
|
+
abris bna: "SHL"
|
|
14
|
+
abris bna cccm: "SHL"
|
|
15
|
+
abris durgence et nfi: "SHL"
|
|
16
|
+
abris nfi: "SHL"
|
|
17
|
+
action contre les mines: "PRO-MIN"
|
|
18
|
+
aee: "SHL"
|
|
19
|
+
agriculture: "FSC"
|
|
20
|
+
agua saneamiento e higiene: "WSH"
|
|
21
|
+
all: "Intersectoral"
|
|
22
|
+
alojamiento de emergencia: "SHL"
|
|
23
|
+
alojamiento de emergencia shelter: "SHL"
|
|
24
|
+
alojamientos y asentamientos: "SHL"
|
|
25
|
+
ame: "SHL"
|
|
26
|
+
ash: "WSH"
|
|
27
|
+
assainissement: "WSH"
|
|
28
|
+
camp coordination and camp management: "CCM"
|
|
29
|
+
camp coordination camp management: "CCM"
|
|
30
|
+
cash: "Cash"
|
|
31
|
+
cccm: "CCM"
|
|
32
|
+
ccs: "CCM"
|
|
33
|
+
cluster coordination: "CCM"
|
|
34
|
+
coord services support: "CCM"
|
|
35
|
+
coordinacion informacion: "CCM"
|
|
36
|
+
coordination: "CCM"
|
|
37
|
+
coordination et gestion des camps: "CCM"
|
|
38
|
+
eah: "WSH"
|
|
39
|
+
eau: "WSH"
|
|
40
|
+
eau assainissement et hygiene: "WSH"
|
|
41
|
+
eau hygiene: "WSH"
|
|
42
|
+
eau hygiene assainissement: "WSH"
|
|
43
|
+
eau hygiene et assainissement: "WSH"
|
|
44
|
+
educacion: "EDU"
|
|
45
|
+
educacion en emergencias: "EDU"
|
|
46
|
+
education: "EDU"
|
|
47
|
+
eha: "WSH"
|
|
48
|
+
emergency shelter and non food items: "SHL"
|
|
49
|
+
epah: "WSH"
|
|
50
|
+
erl: "ERY"
|
|
51
|
+
esnfi: "SHL"
|
|
52
|
+
explosive hazards: "PRO-MIN"
|
|
53
|
+
food: "FSC"
|
|
54
|
+
food security and agriculture: "FSC"
|
|
55
|
+
food security and livelihoods: "FSC"
|
|
56
|
+
food security and nutrition: "FSC"
|
|
57
|
+
food security livelihood: "FSC"
|
|
58
|
+
fsl: "FSC"
|
|
59
|
+
gestion des sites daccueil temporaires: "SHL"
|
|
60
|
+
gbv: "PRO-GBV"
|
|
61
|
+
hlp: "PRO-HLP"
|
|
62
|
+
humanitaire: "Hum"
|
|
63
|
+
hygiene: "WSH"
|
|
64
|
+
hygiene assainissement: "WSH"
|
|
65
|
+
intercluster: "Multi" # From Somalia 3W, hopefully not to be confused with intersectoral
|
|
66
|
+
logement terre et biens: "PRO-HLP"
|
|
67
|
+
logistica: "LOG"
|
|
68
|
+
logistique: "LOG"
|
|
69
|
+
manejo y gestion de campamentos: "CCM"
|
|
70
|
+
ms: "Multi"
|
|
71
|
+
multi secteur: "Multi"
|
|
72
|
+
multisectoriel: "Multi"
|
|
73
|
+
nutricion: "NUT"
|
|
74
|
+
nutrition: "NUT"
|
|
75
|
+
operatioanl presence water sanitation hygiene: "WSH"
|
|
76
|
+
operational presence education in emergencies: "EDU"
|
|
77
|
+
operational presence emergency shelter non food items: "SHL"
|
|
78
|
+
operational presence food security agriculture: "FSC"
|
|
79
|
+
operational presence health: "HEA"
|
|
80
|
+
operational presence nutrition: "NUT"
|
|
81
|
+
operational presence protection: "PRO"
|
|
82
|
+
pro cpm: "PRO-CPN"
|
|
83
|
+
pronna: "PRO-CPN"
|
|
84
|
+
propg: "PRO"
|
|
85
|
+
proteccion infantil: "PRO-CPN"
|
|
86
|
+
protection: "PRO"
|
|
87
|
+
protection de lenfance: "PRO-CPN"
|
|
88
|
+
protection de lenfant: "PRO-CPN"
|
|
89
|
+
protection generale: "PRO"
|
|
90
|
+
protection logement terre et propriete: "PRO-HLP"
|
|
91
|
+
protection ltb: "PRO-HLP"
|
|
92
|
+
protection lutte anti mines: "PRO-MIN"
|
|
93
|
+
protection pe: "PRO-CPN"
|
|
94
|
+
protection protection de lenfant: "PRO-CPN"
|
|
95
|
+
protection violences basees sur le genre: "PRO-GBV"
|
|
96
|
+
protection vgb: "PRO-GBV"
|
|
97
|
+
proteccion: "PRO"
|
|
98
|
+
provbg: "PRO-GBV"
|
|
99
|
+
psea: "PRO-GBV"
|
|
100
|
+
rapid response mechanism: "ERY"
|
|
101
|
+
rcf: "CCM"
|
|
102
|
+
rcf education: "EDU"
|
|
103
|
+
rcf food security and livelihoods: "FSC"
|
|
104
|
+
rcf health and nutrtion: "HEA"
|
|
105
|
+
rcf protection: "PRO"
|
|
106
|
+
recuperacion temprana: "ERY"
|
|
107
|
+
relevement precoce: "ERY"
|
|
108
|
+
relevement rapide: "ERY"
|
|
109
|
+
refugee response: "CCM"
|
|
110
|
+
refugees migrants multi sector: "CCM"
|
|
111
|
+
reponse aux refugies: "CCM"
|
|
112
|
+
sa: "FSC"
|
|
113
|
+
sal: "HEA"
|
|
114
|
+
salud: "HEA"
|
|
115
|
+
samv: "FSC"
|
|
116
|
+
sante: "HEA"
|
|
117
|
+
securite alimentaire: "FSC"
|
|
118
|
+
seguridad alimentaria: "FSC"
|
|
119
|
+
seguridad alimentaria y nutricion: "FSC"
|
|
120
|
+
services humanitaires communs: "Hum"
|
|
121
|
+
sexual and reproductive health: "HEA"
|
|
122
|
+
shelter: "SHL"
|
|
123
|
+
shelter nfi: "SHL"
|
|
124
|
+
shelter nfis: "SHL"
|
|
125
|
+
shelter and nfi: "SHL"
|
|
126
|
+
shelter and nfis: "SHL"
|
|
127
|
+
shelter and non food items: "SHL"
|
|
128
|
+
site management: "CCM"
|
|
129
|
+
snfi: "SHL"
|
|
130
|
+
telecommunications: "TEL"
|
|
131
|
+
telecommunications durgence: "TEL"
|
|
132
|
+
telecomunicaciones de emergencia: "TEL"
|
|
133
|
+
vbg: "PRO-GBV"
|
|
134
|
+
violences basees sur le genre: "PRO-GBV"
|
|
135
|
+
violence basee sur le genre: "PRO-GBV"
|
|
136
|
+
violencia basada en genero: "PRO-GBV"
|
|
137
|
+
wash: "WSH"
|
|
138
|
+
water sanitation and hygiene: "WSH"
|
|
@@ -282,9 +282,9 @@ class Sources:
|
|
|
282
282
|
if no_sources:
|
|
283
283
|
source_configuration["no_sources"] = True
|
|
284
284
|
return source_configuration
|
|
285
|
-
source_configuration[
|
|
286
|
-
|
|
287
|
-
|
|
285
|
+
source_configuration["should_overwrite_sources"] = (
|
|
286
|
+
should_overwrite_sources
|
|
287
|
+
)
|
|
288
288
|
if suffix_attribute:
|
|
289
289
|
source_configuration["suffix_attribute"] = suffix_attribute
|
|
290
290
|
return source_configuration
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: hdx-python-scraper
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.5.2
|
|
4
4
|
Summary: HDX Python scraper utilities to assemble data from multiple sources
|
|
5
5
|
Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
|
|
6
6
|
Author-email: Michael Rans <rans@email.com>
|
|
@@ -26,13 +26,14 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Requires-Dist: gspread
|
|
29
|
-
Requires-Dist: hdx-python-api>=6.
|
|
30
|
-
Requires-Dist: hdx-python-country>=3.6
|
|
29
|
+
Requires-Dist: hdx-python-api>=6.3.7
|
|
30
|
+
Requires-Dist: hdx-python-country>=3.8.6
|
|
31
|
+
Requires-Dist: hdx-python-utilities>=3.8.2
|
|
31
32
|
Requires-Dist: regex
|
|
32
33
|
Provides-Extra: dev
|
|
33
34
|
Requires-Dist: pre-commit; extra == 'dev'
|
|
34
35
|
Provides-Extra: pandas
|
|
35
|
-
Requires-Dist: pandas>=2.
|
|
36
|
+
Requires-Dist: pandas>=2.2.2; extra == 'pandas'
|
|
36
37
|
Provides-Extra: test
|
|
37
38
|
Requires-Dist: pytest; extra == 'test'
|
|
38
39
|
Requires-Dist: pytest-cov; extra == 'test'
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
hdx/scraper/framework/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
|
|
2
|
+
hdx/scraper/framework/_version.py,sha256=qrwMUvCUqANtlUPbnE5wPCDZujNKWYOaJRJsJky27Ac,411
|
|
3
|
+
hdx/scraper/framework/base_scraper.py,sha256=vvwljQ5QWr6hpCjOS89RG1pvC955aLoPvm6pSovO75o,15432
|
|
4
|
+
hdx/scraper/framework/runner.py,sha256=GFnZM9HciZFibwwRgDHVk9F_y2n27ctpRwyeD1_ZcKw,53538
|
|
5
|
+
hdx/scraper/framework/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
hdx/scraper/framework/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
|
|
7
|
+
hdx/scraper/framework/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
|
|
8
|
+
hdx/scraper/framework/outputs/googlesheets.py,sha256=jLAfXz4usmLFrePxRIsMflxKPzSGv9T3jlMpSV-s4II,3087
|
|
9
|
+
hdx/scraper/framework/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
|
|
10
|
+
hdx/scraper/framework/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
hdx/scraper/framework/scrapers/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
|
|
12
|
+
hdx/scraper/framework/scrapers/configurable_scraper.py,sha256=PYPtU9XZALNx-2Jr8a8kVVDsT2j9yGgBaw6wXhztQIM,20612
|
|
13
|
+
hdx/scraper/framework/scrapers/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
|
|
14
|
+
hdx/scraper/framework/scrapers/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
|
|
15
|
+
hdx/scraper/framework/scrapers/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
|
|
16
|
+
hdx/scraper/framework/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
|
|
17
|
+
hdx/scraper/framework/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
|
|
18
|
+
hdx/scraper/framework/utilities/reader.py,sha256=pQcGg5TIhl3c-QX_F1sZxY4Ar0N7TLalX38IMuCXA-0,26568
|
|
19
|
+
hdx/scraper/framework/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
|
|
20
|
+
hdx/scraper/framework/utilities/sector.py,sha256=rl_TceRYc5YRoLccr0ABCM42ZLLtLzezWWWQ5YtbQDE,1947
|
|
21
|
+
hdx/scraper/framework/utilities/sector_configuration.yaml,sha256=LAUR5xfLU5qua5qtc3TcwEei0sD1zoCb_vfAxD7Grb8,3894
|
|
22
|
+
hdx/scraper/framework/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
|
|
23
|
+
hdx/scraper/framework/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
|
|
24
|
+
hdx_python_scraper-2.5.2.dist-info/METADATA,sha256=zNHR55fmxnxl_0K3u7zAl1rz1molJ7DX6meBJmK49Es,3361
|
|
25
|
+
hdx_python_scraper-2.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
26
|
+
hdx_python_scraper-2.5.2.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
|
|
27
|
+
hdx_python_scraper-2.5.2.dist-info/RECORD,,
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
|
|
2
|
-
hdx/scraper/_version.py,sha256=-yOUI-ZIjXgov3YpdPKmW_w-fIBrZtGytjk8Bz_DwDI,411
|
|
3
|
-
hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
|
|
4
|
-
hdx/scraper/runner.py,sha256=3UoVi5jVRcex0U8gf1TTBLXGxisRPmCMSV8jUYHWZZM,52750
|
|
5
|
-
hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
|
|
7
|
-
hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
|
|
8
|
-
hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
|
|
9
|
-
hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
|
|
10
|
-
hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
|
|
11
|
-
hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
|
|
13
|
-
hdx/scraper/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
|
|
14
|
-
hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1Ep39QY,3087
|
|
15
|
-
hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
|
|
16
|
-
hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
|
|
17
|
-
hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
|
|
18
|
-
hdx/scraper/utilities/reader.py,sha256=hexLIJW3CdP4DmobqMM-Z2d6pgcCs1zWWBW-stqoeNU,22975
|
|
19
|
-
hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
|
|
20
|
-
hdx/scraper/utilities/sources.py,sha256=VNhFYSUM2xeDlN6y4Ya9_0BskjPtjwQZmCKnQgpOemQ,11511
|
|
21
|
-
hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
|
|
22
|
-
hdx_python_scraper-2.3.4.dist-info/METADATA,sha256=fCv1Y7-m0IgaLUhfNddwjCPEnl7tOheLDntDhngefQc,3318
|
|
23
|
-
hdx_python_scraper-2.3.4.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
|
|
24
|
-
hdx_python_scraper-2.3.4.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
|
|
25
|
-
hdx_python_scraper-2.3.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|