hdx-python-scraper 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdx/scraper/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2.3.5'
16
- __version_tuple__ = version_tuple = (2, 3, 5)
15
+ __version__ = version = '2.3.7'
16
+ __version_tuple__ = version_tuple = (2, 3, 7)
@@ -141,9 +141,9 @@ class BaseScraper(ABC):
141
141
  "should_overwrite_sources"
142
142
  )
143
143
  if should_overwrite_sources is not None:
144
- self.source_configuration[
145
- "should_overwrite_sources"
146
- ] = should_overwrite_sources
144
+ self.source_configuration["should_overwrite_sources"] = (
145
+ should_overwrite_sources
146
+ )
147
147
  source = self.datasetinfo["source"]
148
148
  if isinstance(source, str):
149
149
  source = {"default_source": source}
@@ -1,3 +1,4 @@
1
+ import glob
1
2
  import logging
2
3
  from datetime import datetime
3
4
  from os.path import join
@@ -10,6 +11,7 @@ from slugify import slugify
10
11
 
11
12
  from . import get_startend_dates_from_time_period, match_template
12
13
  from .sources import Sources
14
+ from hdx.api.configuration import Configuration
13
15
  from hdx.data.dataset import Dataset
14
16
  from hdx.data.resource import Resource
15
17
  from hdx.utilities.dateparse import parse_date
@@ -238,11 +240,14 @@ class Read(Retrieve):
238
240
  **kwargs,
239
241
  )
240
242
 
241
- def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
243
+ def read_dataset(
244
+ self, dataset_name: str, configuration: Optional[Configuration] = None
245
+ ) -> Optional[Dataset]:
242
246
  """Read HDX dataset
243
247
 
244
248
  Args:
245
249
  dataset_name (str): Dataset name
250
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
246
251
 
247
252
  Returns:
248
253
  Optional[Dataset]: The dataset that was read or None
@@ -252,7 +257,7 @@ class Read(Retrieve):
252
257
  logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
253
258
  dataset = Dataset.load_from_json(saved_path)
254
259
  else:
255
- dataset = Dataset.read_from_hdx(dataset_name)
260
+ dataset = Dataset.read_from_hdx(dataset_name, configuration)
256
261
  if self.save:
257
262
  logger.info(f"Saving dataset {dataset_name} in {saved_path}")
258
263
  if dataset is None:
@@ -261,6 +266,56 @@ class Read(Retrieve):
261
266
  dataset.save_to_json(saved_path, follow_urls=True)
262
267
  return dataset
263
268
 
269
+ def search_datasets(
270
+ self,
271
+ filename: str,
272
+ query: Optional[str] = "*:*",
273
+ configuration: Optional[Configuration] = None,
274
+ page_size: int = 1000,
275
+ **kwargs: Any,
276
+ ) -> List[Dataset]:
277
+ """Search for datasets in HDX (or load previously saved search results)
278
+
279
+ Args:
280
+ filename (str): Filename for saved files. Will be suffixed by an underscore and a number.
281
+ query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
282
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
283
+ page_size (int): Size of page to return. Defaults to 1000.
284
+ **kwargs: See below
285
+ fq (string): Any filter queries to apply
286
+ rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
287
+ start (int): Offset in the complete result for where the set of returned datasets should begin
288
+ sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
289
+ facet (string): Whether to enable faceted results. Defaults to True.
290
+ facet.mincount (int): Minimum counts for facet fields should be included in the results
291
+ facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
292
+ facet.field (List[str]): Fields to facet upon. Default is empty.
293
+ use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.
294
+
295
+ Returns:
296
+ List[Dataset]: list of datasets resulting from query
297
+ """
298
+
299
+ saved_path = join(self.saved_dir, filename)
300
+ if self.use_saved:
301
+ logger.info(
302
+ f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
303
+ )
304
+ datasets = []
305
+ for file_path in glob.glob(f"{saved_path}_*.json"):
306
+ datasets.append(Dataset.load_from_json(file_path))
307
+ else:
308
+ datasets = Dataset.search_in_hdx(
309
+ query, configuration, page_size, **kwargs
310
+ )
311
+ if self.save:
312
+ for i, dataset in enumerate(datasets):
313
+ file_path = f"{saved_path}_{i}.json"
314
+ name = dataset["name"]
315
+ logger.info(f"Saving dataset {name} in {file_path}")
316
+ dataset.save_to_json(file_path, follow_urls=True)
317
+ return datasets
318
+
264
319
  @staticmethod
265
320
  def construct_filename(name: str, format: str):
266
321
  """Construct filename from name and format. The filename of the file
@@ -438,7 +493,10 @@ class Read(Retrieve):
438
493
  return self.hxl_info_file(name, format, url, **kwargs)
439
494
 
440
495
  def read_hdx_metadata(
441
- self, datasetinfo: Dict, do_resource_check: bool = True
496
+ self,
497
+ datasetinfo: Dict,
498
+ do_resource_check: bool = True,
499
+ configuration: Optional[Configuration] = None,
442
500
  ) -> Optional[Resource]:
443
501
  """Read metadata from HDX dataset and add to input dictionary. If url
444
502
  is not supplied, will look through resources for one that matches
@@ -454,13 +512,14 @@ class Read(Retrieve):
454
512
  Args:
455
513
  datasetinfo (Dict): Dictionary of information about dataset
456
514
  do_resource_check (bool): Whether to check resources. Defaults to True.
515
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
457
516
 
458
517
  Returns:
459
518
  Optional[Resource]: The resource if a url was not given
460
519
  """
461
520
  dataset_nameinfo = datasetinfo["dataset"]
462
521
  if isinstance(dataset_nameinfo, str):
463
- dataset = self.read_dataset(dataset_nameinfo)
522
+ dataset = self.read_dataset(dataset_nameinfo, configuration)
464
523
  resource = None
465
524
  url = datasetinfo.get("url")
466
525
  resource_name = datasetinfo.get("resource")
@@ -491,24 +550,24 @@ class Read(Retrieve):
491
550
  else:
492
551
  url = resource["url"] # otherwise set the url key in
493
552
  # datasetinfo to the resource url (by setting url here)
494
- datasetinfo[
495
- "hapi_resource_metadata"
496
- ] = self.get_hapi_resource_metadata(resource)
553
+ datasetinfo["hapi_resource_metadata"] = (
554
+ self.get_hapi_resource_metadata(resource)
555
+ )
497
556
  datasetinfo["url"] = url
498
557
  if "source_date" not in datasetinfo:
499
- datasetinfo[
500
- "source_date"
501
- ] = get_startend_dates_from_time_period(
502
- dataset, today=self.today
558
+ datasetinfo["source_date"] = (
559
+ get_startend_dates_from_time_period(
560
+ dataset, today=self.today
561
+ )
503
562
  )
504
563
  if "source" not in datasetinfo:
505
564
  datasetinfo["source"] = dataset["dataset_source"]
506
565
  if "source_url" not in datasetinfo:
507
566
  datasetinfo["source_url"] = dataset.get_hdx_url()
508
567
  Sources.standardise_datasetinfo_source_date(datasetinfo)
509
- datasetinfo[
510
- "hapi_dataset_metadata"
511
- ] = self.get_hapi_dataset_metadata(dataset, datasetinfo)
568
+ datasetinfo["hapi_dataset_metadata"] = (
569
+ self.get_hapi_dataset_metadata(dataset, datasetinfo)
570
+ )
512
571
  return resource
513
572
 
514
573
  if "source_date" not in datasetinfo:
@@ -527,7 +586,7 @@ class Read(Retrieve):
527
586
  for hxltag, dataset_name in dataset_nameinfo.items():
528
587
  dataset = datasets.get(dataset_name)
529
588
  if not dataset:
530
- dataset = self.read_dataset(dataset_name)
589
+ dataset = self.read_dataset(dataset_name, configuration)
531
590
  datasets[dataset_name] = dataset
532
591
  if source_date is not None:
533
592
  if hxltag == "default_dataset":
@@ -561,18 +620,22 @@ class Read(Retrieve):
561
620
  def read_hdx(
562
621
  self,
563
622
  datasetinfo: Dict,
623
+ configuration: Optional[Configuration] = None,
564
624
  **kwargs: Any,
565
625
  ) -> Tuple[List[str], Iterator[Dict]]:
566
626
  """Read data and metadata from HDX dataset
567
627
 
568
628
  Args:
569
629
  datasetinfo (Dict): Dictionary of information about dataset
630
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
570
631
  **kwargs: Parameters to pass to download_file call
571
632
 
572
633
  Returns:
573
634
  Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
574
635
  """
575
- resource = self.read_hdx_metadata(datasetinfo)
636
+ resource = self.read_hdx_metadata(
637
+ datasetinfo, configuration=configuration
638
+ )
576
639
  filename = kwargs.get("filename")
577
640
  if filename:
578
641
  del kwargs["filename"]
@@ -593,12 +656,14 @@ class Read(Retrieve):
593
656
  def read(
594
657
  self,
595
658
  datasetinfo: Dict,
659
+ configuration: Optional[Configuration] = None,
596
660
  **kwargs: Any,
597
661
  ) -> Tuple[List[str], Iterator[Dict]]:
598
662
  """Read data and metadata from HDX dataset
599
663
 
600
664
  Args:
601
665
  datasetinfo (Dict): Dictionary of information about dataset
666
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
602
667
  **kwargs: Parameters to pass to download_file call
603
668
 
604
669
  Returns:
@@ -607,7 +672,9 @@ class Read(Retrieve):
607
672
  format = datasetinfo["format"]
608
673
  if format in ["json", "csv", "xls", "xlsx"]:
609
674
  if "dataset" in datasetinfo:
610
- headers, iterator = self.read_hdx(datasetinfo, **kwargs)
675
+ headers, iterator = self.read_hdx(
676
+ datasetinfo, configuration, **kwargs
677
+ )
611
678
  else:
612
679
  headers, iterator = self.read_tabular(datasetinfo, **kwargs)
613
680
  else:
@@ -282,9 +282,9 @@ class Sources:
282
282
  if no_sources:
283
283
  source_configuration["no_sources"] = True
284
284
  return source_configuration
285
- source_configuration[
286
- "should_overwrite_sources"
287
- ] = should_overwrite_sources
285
+ source_configuration["should_overwrite_sources"] = (
286
+ should_overwrite_sources
287
+ )
288
288
  if suffix_attribute:
289
289
  source_configuration["suffix_attribute"] = suffix_attribute
290
290
  return source_configuration
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: hdx-python-scraper
3
- Version: 2.3.5
3
+ Version: 2.3.7
4
4
  Summary: HDX Python scraper utilities to assemble data from multiple sources
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -26,8 +26,9 @@ Classifier: Programming Language :: Python :: 3.12
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Requires-Python: >=3.8
28
28
  Requires-Dist: gspread
29
- Requires-Dist: hdx-python-api>=6.2.1
30
- Requires-Dist: hdx-python-country>=3.6.4
29
+ Requires-Dist: hdx-python-api>=6.2.8
30
+ Requires-Dist: hdx-python-country>=3.7.0
31
+ Requires-Dist: hdx-python-utilities>=3.6.8
31
32
  Requires-Dist: regex
32
33
  Provides-Extra: dev
33
34
  Requires-Dist: pre-commit; extra == 'dev'
@@ -1,6 +1,6 @@
1
1
  hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
- hdx/scraper/_version.py,sha256=-9aYLvgAp04zL8yFAMPjvf6kLKgqW1mLgyuk6XA3LcE,411
3
- hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
2
+ hdx/scraper/_version.py,sha256=SH_yCAX65tCK8PRP8gyPvUcp4HPVksM4fKEz1rXjzjM,411
3
+ hdx/scraper/base_scraper.py,sha256=2eJifpb8G_KtEb9Z273suDCiMPteJsCBHwDEk3o0wA8,15433
4
4
  hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
5
5
  hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
15
15
  hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
16
16
  hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
17
17
  hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
18
- hdx/scraper/utilities/reader.py,sha256=hexLIJW3CdP4DmobqMM-Z2d6pgcCs1zWWBW-stqoeNU,22975
18
+ hdx/scraper/utilities/reader.py,sha256=03S53U1GylPaeRoqEj3TT5UgiKTwVODUx3IETwCb9ps,26364
19
19
  hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
20
- hdx/scraper/utilities/sources.py,sha256=VNhFYSUM2xeDlN6y4Ya9_0BskjPtjwQZmCKnQgpOemQ,11511
20
+ hdx/scraper/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
21
21
  hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
22
- hdx_python_scraper-2.3.5.dist-info/METADATA,sha256=jYBTVEB111S1R3Cj8fZByzM4E3nRRKCr31bsCPstjPA,3318
23
- hdx_python_scraper-2.3.5.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
24
- hdx_python_scraper-2.3.5.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
- hdx_python_scraper-2.3.5.dist-info/RECORD,,
22
+ hdx_python_scraper-2.3.7.dist-info/METADATA,sha256=Nw-xgPumG7UzJw3M1D5G9kZeUgZObM3m8mkkA1kutqg,3361
23
+ hdx_python_scraper-2.3.7.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
24
+ hdx_python_scraper-2.3.7.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
+ hdx_python_scraper-2.3.7.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.21.1
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any