hdx-python-scraper 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdx/scraper/_version.py +2 -2
- hdx/scraper/base_scraper.py +3 -3
- hdx/scraper/utilities/reader.py +84 -17
- hdx/scraper/utilities/sources.py +3 -3
- {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/METADATA +5 -4
- {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/RECORD +8 -8
- {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/WHEEL +1 -1
- {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/licenses/LICENSE +0 -0
hdx/scraper/_version.py
CHANGED
hdx/scraper/base_scraper.py
CHANGED
|
@@ -141,9 +141,9 @@ class BaseScraper(ABC):
|
|
|
141
141
|
"should_overwrite_sources"
|
|
142
142
|
)
|
|
143
143
|
if should_overwrite_sources is not None:
|
|
144
|
-
self.source_configuration[
|
|
145
|
-
|
|
146
|
-
|
|
144
|
+
self.source_configuration["should_overwrite_sources"] = (
|
|
145
|
+
should_overwrite_sources
|
|
146
|
+
)
|
|
147
147
|
source = self.datasetinfo["source"]
|
|
148
148
|
if isinstance(source, str):
|
|
149
149
|
source = {"default_source": source}
|
hdx/scraper/utilities/reader.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import glob
|
|
1
2
|
import logging
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
from os.path import join
|
|
@@ -10,6 +11,7 @@ from slugify import slugify
|
|
|
10
11
|
|
|
11
12
|
from . import get_startend_dates_from_time_period, match_template
|
|
12
13
|
from .sources import Sources
|
|
14
|
+
from hdx.api.configuration import Configuration
|
|
13
15
|
from hdx.data.dataset import Dataset
|
|
14
16
|
from hdx.data.resource import Resource
|
|
15
17
|
from hdx.utilities.dateparse import parse_date
|
|
@@ -238,11 +240,14 @@ class Read(Retrieve):
|
|
|
238
240
|
**kwargs,
|
|
239
241
|
)
|
|
240
242
|
|
|
241
|
-
def read_dataset(
|
|
243
|
+
def read_dataset(
|
|
244
|
+
self, dataset_name: str, configuration: Optional[Configuration] = None
|
|
245
|
+
) -> Optional[Dataset]:
|
|
242
246
|
"""Read HDX dataset
|
|
243
247
|
|
|
244
248
|
Args:
|
|
245
249
|
dataset_name (str): Dataset name
|
|
250
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
246
251
|
|
|
247
252
|
Returns:
|
|
248
253
|
Optional[Dataset]: The dataset that was read or None
|
|
@@ -252,7 +257,7 @@ class Read(Retrieve):
|
|
|
252
257
|
logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
|
|
253
258
|
dataset = Dataset.load_from_json(saved_path)
|
|
254
259
|
else:
|
|
255
|
-
dataset = Dataset.read_from_hdx(dataset_name)
|
|
260
|
+
dataset = Dataset.read_from_hdx(dataset_name, configuration)
|
|
256
261
|
if self.save:
|
|
257
262
|
logger.info(f"Saving dataset {dataset_name} in {saved_path}")
|
|
258
263
|
if dataset is None:
|
|
@@ -261,6 +266,56 @@ class Read(Retrieve):
|
|
|
261
266
|
dataset.save_to_json(saved_path, follow_urls=True)
|
|
262
267
|
return dataset
|
|
263
268
|
|
|
269
|
+
def search_datasets(
|
|
270
|
+
self,
|
|
271
|
+
filename: str,
|
|
272
|
+
query: Optional[str] = "*:*",
|
|
273
|
+
configuration: Optional[Configuration] = None,
|
|
274
|
+
page_size: int = 1000,
|
|
275
|
+
**kwargs: Any,
|
|
276
|
+
) -> List[Dataset]:
|
|
277
|
+
"""Read HDX dataset
|
|
278
|
+
|
|
279
|
+
Args:
|
|
280
|
+
filename (str): Filename for saved files. Will be prefixed by underscore and a number.
|
|
281
|
+
query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
|
|
282
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
283
|
+
page_size (int): Size of page to return. Defaults to 1000.
|
|
284
|
+
**kwargs: See below
|
|
285
|
+
fq (string): Any filter queries to apply
|
|
286
|
+
rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
|
|
287
|
+
start (int): Offset in the complete result for where the set of returned datasets should begin
|
|
288
|
+
sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
|
|
289
|
+
facet (string): Whether to enable faceted results. Default to True.
|
|
290
|
+
facet.mincount (int): Minimum counts for facet fields should be included in the results
|
|
291
|
+
facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
|
|
292
|
+
facet.field (List[str]): Fields to facet upon. Default is empty.
|
|
293
|
+
use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.
|
|
294
|
+
|
|
295
|
+
Returns:
|
|
296
|
+
List[Dataset]: list of datasets resulting from query
|
|
297
|
+
"""
|
|
298
|
+
|
|
299
|
+
saved_path = join(self.saved_dir, filename)
|
|
300
|
+
if self.use_saved:
|
|
301
|
+
logger.info(
|
|
302
|
+
f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
|
|
303
|
+
)
|
|
304
|
+
datasets = []
|
|
305
|
+
for file_path in glob.glob(f"{saved_path}_*.json"):
|
|
306
|
+
datasets.append(Dataset.load_from_json(file_path))
|
|
307
|
+
else:
|
|
308
|
+
datasets = Dataset.search_in_hdx(
|
|
309
|
+
query, configuration, page_size, **kwargs
|
|
310
|
+
)
|
|
311
|
+
if self.save:
|
|
312
|
+
for i, dataset in enumerate(datasets):
|
|
313
|
+
file_path = f"{saved_path}_{i}.json"
|
|
314
|
+
name = dataset["name"]
|
|
315
|
+
logger.info(f"Saving dataset {name} in {file_path}")
|
|
316
|
+
dataset.save_to_json(file_path, follow_urls=True)
|
|
317
|
+
return datasets
|
|
318
|
+
|
|
264
319
|
@staticmethod
|
|
265
320
|
def construct_filename(name: str, format: str):
|
|
266
321
|
"""Construct filename from name and format. The filename of the file
|
|
@@ -438,7 +493,10 @@ class Read(Retrieve):
|
|
|
438
493
|
return self.hxl_info_file(name, format, url, **kwargs)
|
|
439
494
|
|
|
440
495
|
def read_hdx_metadata(
|
|
441
|
-
self,
|
|
496
|
+
self,
|
|
497
|
+
datasetinfo: Dict,
|
|
498
|
+
do_resource_check: bool = True,
|
|
499
|
+
configuration: Optional[Configuration] = None,
|
|
442
500
|
) -> Optional[Resource]:
|
|
443
501
|
"""Read metadata from HDX dataset and add to input dictionary. If url
|
|
444
502
|
is not supplied, will look through resources for one that matches
|
|
@@ -454,13 +512,14 @@ class Read(Retrieve):
|
|
|
454
512
|
Args:
|
|
455
513
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
456
514
|
do_resource_check (bool): Whether to check resources. Defaults to False.
|
|
515
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
457
516
|
|
|
458
517
|
Returns:
|
|
459
518
|
Optional[Resource]: The resource if a url was not given
|
|
460
519
|
"""
|
|
461
520
|
dataset_nameinfo = datasetinfo["dataset"]
|
|
462
521
|
if isinstance(dataset_nameinfo, str):
|
|
463
|
-
dataset = self.read_dataset(dataset_nameinfo)
|
|
522
|
+
dataset = self.read_dataset(dataset_nameinfo, configuration)
|
|
464
523
|
resource = None
|
|
465
524
|
url = datasetinfo.get("url")
|
|
466
525
|
resource_name = datasetinfo.get("resource")
|
|
@@ -491,24 +550,24 @@ class Read(Retrieve):
|
|
|
491
550
|
else:
|
|
492
551
|
url = resource["url"] # otherwise set the url key in
|
|
493
552
|
# datasetinfo to the resource url (by setting url here)
|
|
494
|
-
datasetinfo[
|
|
495
|
-
|
|
496
|
-
|
|
553
|
+
datasetinfo["hapi_resource_metadata"] = (
|
|
554
|
+
self.get_hapi_resource_metadata(resource)
|
|
555
|
+
)
|
|
497
556
|
datasetinfo["url"] = url
|
|
498
557
|
if "source_date" not in datasetinfo:
|
|
499
|
-
datasetinfo[
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
558
|
+
datasetinfo["source_date"] = (
|
|
559
|
+
get_startend_dates_from_time_period(
|
|
560
|
+
dataset, today=self.today
|
|
561
|
+
)
|
|
503
562
|
)
|
|
504
563
|
if "source" not in datasetinfo:
|
|
505
564
|
datasetinfo["source"] = dataset["dataset_source"]
|
|
506
565
|
if "source_url" not in datasetinfo:
|
|
507
566
|
datasetinfo["source_url"] = dataset.get_hdx_url()
|
|
508
567
|
Sources.standardise_datasetinfo_source_date(datasetinfo)
|
|
509
|
-
datasetinfo[
|
|
510
|
-
|
|
511
|
-
|
|
568
|
+
datasetinfo["hapi_dataset_metadata"] = (
|
|
569
|
+
self.get_hapi_dataset_metadata(dataset, datasetinfo)
|
|
570
|
+
)
|
|
512
571
|
return resource
|
|
513
572
|
|
|
514
573
|
if "source_date" not in datasetinfo:
|
|
@@ -527,7 +586,7 @@ class Read(Retrieve):
|
|
|
527
586
|
for hxltag, dataset_name in dataset_nameinfo.items():
|
|
528
587
|
dataset = datasets.get(dataset_name)
|
|
529
588
|
if not dataset:
|
|
530
|
-
dataset = self.read_dataset(dataset_name)
|
|
589
|
+
dataset = self.read_dataset(dataset_name, configuration)
|
|
531
590
|
datasets[dataset_name] = dataset
|
|
532
591
|
if source_date is not None:
|
|
533
592
|
if hxltag == "default_dataset":
|
|
@@ -561,18 +620,22 @@ class Read(Retrieve):
|
|
|
561
620
|
def read_hdx(
|
|
562
621
|
self,
|
|
563
622
|
datasetinfo: Dict,
|
|
623
|
+
configuration: Optional[Configuration] = None,
|
|
564
624
|
**kwargs: Any,
|
|
565
625
|
) -> Tuple[List[str], Iterator[Dict]]:
|
|
566
626
|
"""Read data and metadata from HDX dataset
|
|
567
627
|
|
|
568
628
|
Args:
|
|
569
629
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
630
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
570
631
|
**kwargs: Parameters to pass to download_file call
|
|
571
632
|
|
|
572
633
|
Returns:
|
|
573
634
|
Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
|
|
574
635
|
"""
|
|
575
|
-
resource = self.read_hdx_metadata(
|
|
636
|
+
resource = self.read_hdx_metadata(
|
|
637
|
+
datasetinfo, configuration=configuration
|
|
638
|
+
)
|
|
576
639
|
filename = kwargs.get("filename")
|
|
577
640
|
if filename:
|
|
578
641
|
del kwargs["filename"]
|
|
@@ -593,12 +656,14 @@ class Read(Retrieve):
|
|
|
593
656
|
def read(
|
|
594
657
|
self,
|
|
595
658
|
datasetinfo: Dict,
|
|
659
|
+
configuration: Optional[Configuration] = None,
|
|
596
660
|
**kwargs: Any,
|
|
597
661
|
) -> Tuple[List[str], Iterator[Dict]]:
|
|
598
662
|
"""Read data and metadata from HDX dataset
|
|
599
663
|
|
|
600
664
|
Args:
|
|
601
665
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
666
|
+
configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
|
|
602
667
|
**kwargs: Parameters to pass to download_file call
|
|
603
668
|
|
|
604
669
|
Returns:
|
|
@@ -607,7 +672,9 @@ class Read(Retrieve):
|
|
|
607
672
|
format = datasetinfo["format"]
|
|
608
673
|
if format in ["json", "csv", "xls", "xlsx"]:
|
|
609
674
|
if "dataset" in datasetinfo:
|
|
610
|
-
headers, iterator = self.read_hdx(
|
|
675
|
+
headers, iterator = self.read_hdx(
|
|
676
|
+
datasetinfo, configuration, **kwargs
|
|
677
|
+
)
|
|
611
678
|
else:
|
|
612
679
|
headers, iterator = self.read_tabular(datasetinfo, **kwargs)
|
|
613
680
|
else:
|
hdx/scraper/utilities/sources.py
CHANGED
|
@@ -282,9 +282,9 @@ class Sources:
|
|
|
282
282
|
if no_sources:
|
|
283
283
|
source_configuration["no_sources"] = True
|
|
284
284
|
return source_configuration
|
|
285
|
-
source_configuration[
|
|
286
|
-
|
|
287
|
-
|
|
285
|
+
source_configuration["should_overwrite_sources"] = (
|
|
286
|
+
should_overwrite_sources
|
|
287
|
+
)
|
|
288
288
|
if suffix_attribute:
|
|
289
289
|
source_configuration["suffix_attribute"] = suffix_attribute
|
|
290
290
|
return source_configuration
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: hdx-python-scraper
|
|
3
|
-
Version: 2.3.5
|
|
3
|
+
Version: 2.3.7
|
|
4
4
|
Summary: HDX Python scraper utilities to assemble data from multiple sources
|
|
5
5
|
Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
|
|
6
6
|
Author-email: Michael Rans <rans@email.com>
|
|
@@ -26,8 +26,9 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Requires-Dist: gspread
|
|
29
|
-
Requires-Dist: hdx-python-api>=6.2.
|
|
30
|
-
Requires-Dist: hdx-python-country>=3.
|
|
29
|
+
Requires-Dist: hdx-python-api>=6.2.8
|
|
30
|
+
Requires-Dist: hdx-python-country>=3.7.0
|
|
31
|
+
Requires-Dist: hdx-python-utilities>=3.6.8
|
|
31
32
|
Requires-Dist: regex
|
|
32
33
|
Provides-Extra: dev
|
|
33
34
|
Requires-Dist: pre-commit; extra == 'dev'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
|
|
2
|
-
hdx/scraper/_version.py,sha256
|
|
3
|
-
hdx/scraper/base_scraper.py,sha256=
|
|
2
|
+
hdx/scraper/_version.py,sha256=SH_yCAX65tCK8PRP8gyPvUcp4HPVksM4fKEz1rXjzjM,411
|
|
3
|
+
hdx/scraper/base_scraper.py,sha256=2eJifpb8G_KtEb9Z273suDCiMPteJsCBHwDEk3o0wA8,15433
|
|
4
4
|
hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
|
|
5
5
|
hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
|
|
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
|
|
|
15
15
|
hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
|
|
16
16
|
hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
|
|
17
17
|
hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
|
|
18
|
-
hdx/scraper/utilities/reader.py,sha256=
|
|
18
|
+
hdx/scraper/utilities/reader.py,sha256=03S53U1GylPaeRoqEj3TT5UgiKTwVODUx3IETwCb9ps,26364
|
|
19
19
|
hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
|
|
20
|
-
hdx/scraper/utilities/sources.py,sha256=
|
|
20
|
+
hdx/scraper/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
|
|
21
21
|
hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
|
|
22
|
-
hdx_python_scraper-2.3.
|
|
23
|
-
hdx_python_scraper-2.3.
|
|
24
|
-
hdx_python_scraper-2.3.
|
|
25
|
-
hdx_python_scraper-2.3.
|
|
22
|
+
hdx_python_scraper-2.3.7.dist-info/METADATA,sha256=Nw-xgPumG7UzJw3M1D5G9kZeUgZObM3m8mkkA1kutqg,3361
|
|
23
|
+
hdx_python_scraper-2.3.7.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
24
|
+
hdx_python_scraper-2.3.7.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
|
|
25
|
+
hdx_python_scraper-2.3.7.dist-info/RECORD,,
|
|
File without changes
|