hdx-python-utilities 3.7.1__tar.gz → 3.7.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/PKG-INFO +2 -1
  2. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/documentation/main.md +3 -0
  3. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/pyproject.toml +1 -0
  4. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/requirements.txt +13 -11
  5. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/_version.py +2 -2
  6. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/downloader.py +19 -0
  7. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/retriever.py +1 -1
  8. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/text.py +39 -0
  9. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_downloader.py +76 -0
  10. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_text.py +9 -0
  11. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.config/coveragerc +0 -0
  12. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.config/pre-commit-config.yaml +0 -0
  13. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.config/pytest.ini +0 -0
  14. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.config/ruff.toml +0 -0
  15. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.github/workflows/publish.yaml +0 -0
  16. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.github/workflows/run-python-tests.yaml +0 -0
  17. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/.gitignore +0 -0
  18. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/CONTRIBUTING.md +0 -0
  19. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/LICENSE +0 -0
  20. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/README.md +0 -0
  21. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/documentation/.readthedocs.yaml +0 -0
  22. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/documentation/pydoc-markdown.yaml +0 -0
  23. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/__init__.py +0 -0
  24. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/base_downloader.py +0 -0
  25. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/compare.py +0 -0
  26. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/dateparse.py +0 -0
  27. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/dictandlist.py +0 -0
  28. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/easy_logging.py +0 -0
  29. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/email.py +0 -0
  30. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/encoding.py +0 -0
  31. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/errors_onexit.py +0 -0
  32. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/frictionless_wrapper.py +0 -0
  33. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/html.py +0 -0
  34. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/loader.py +0 -0
  35. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/path.py +0 -0
  36. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/saver.py +0 -0
  37. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/session.py +0 -0
  38. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/state.py +0 -0
  39. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/typehint.py +0 -0
  40. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/useragent.py +0 -0
  41. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/src/hdx/utilities/uuid.py +0 -0
  42. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/compare/test_csv_processing.csv +0 -0
  43. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/compare/test_csv_processing2.csv +0 -0
  44. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/empty.yaml +0 -0
  45. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/hdx_config.json +0 -0
  46. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/hdx_config.yaml +0 -0
  47. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/hdx_email_configuration.json +0 -0
  48. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/hdx_email_configuration.yaml +0 -0
  49. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/json_csv.yaml +0 -0
  50. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/logging_config.json +0 -0
  51. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/logging_config.yaml +0 -0
  52. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/project_configuration.json +0 -0
  53. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/project_configuration.yaml +0 -0
  54. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/smtp_config.json +0 -0
  55. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/smtp_config.yaml +0 -0
  56. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/user_agent_config.yaml +0 -0
  57. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/user_agent_config2.yaml +0 -0
  58. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/user_agent_config3.yaml +0 -0
  59. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/config/user_agent_config_wrong.yaml +0 -0
  60. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/basicauth.txt +0 -0
  61. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/bearertoken.txt +0 -0
  62. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/extra_params.json +0 -0
  63. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/extra_params.yaml +0 -0
  64. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/extra_params_tree.yaml +0 -0
  65. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_csv_processing.csv +0 -0
  66. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_csv_processing_blanks.csv +0 -0
  67. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_data.csv +0 -0
  68. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_data.xlsx +0 -0
  69. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_data1.csv/empty.txt +0 -0
  70. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_data2.csv +0 -0
  71. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_json_processing.json +0 -0
  72. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_xls_processing.xls +0 -0
  73. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/downloader/test_xlsx_processing.xlsx +0 -0
  74. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/html/response.html +0 -0
  75. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/loader/empty.json +0 -0
  76. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/loader/empty.yaml +0 -0
  77. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/fallbacks/test.csv +0 -0
  78. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/fallbacks/test.json +0 -0
  79. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/fallbacks/test.txt +0 -0
  80. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/fallbacks/test.yaml +0 -0
  81. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/retriever-test.csv +0 -0
  82. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/test.csv +0 -0
  83. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/test.json +0 -0
  84. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/test.txt +0 -0
  85. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/test.yaml +0 -0
  86. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/retriever/test_hxl.csv +0 -0
  87. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out.csv +0 -0
  88. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out.json +0 -0
  89. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out2.csv +0 -0
  90. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out2.json +0 -0
  91. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out5.json +0 -0
  92. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out6.json +0 -0
  93. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out7.json +0 -0
  94. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out8.csv +0 -0
  95. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/out8.json +0 -0
  96. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-false_sortkeys-false.json +0 -0
  97. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-false_sortkeys-false.yaml +0 -0
  98. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-false_sortkeys-true.json +0 -0
  99. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-false_sortkeys-true.yaml +0 -0
  100. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-true_sortkeys-false.json +0 -0
  101. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-true_sortkeys-false.yaml +0 -0
  102. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-true_sortkeys-true.json +0 -0
  103. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/saver/pretty-true_sortkeys-true.yaml +0 -0
  104. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/state/analysis_dates.txt +0 -0
  105. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/state/last_build_date.txt +0 -0
  106. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/fixtures/test_data.csv +0 -0
  107. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/conftest.py +0 -0
  108. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_compare.py +0 -0
  109. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_dateparse.py +0 -0
  110. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_dictandlist.py +0 -0
  111. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_easy_logging.py +0 -0
  112. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_email.py +0 -0
  113. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_encoding.py +0 -0
  114. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_errors_onexit.py +0 -0
  115. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_html.py +0 -0
  116. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_loader.py +0 -0
  117. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_path.py +0 -0
  118. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_retriever.py +0 -0
  119. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_saver.py +0 -0
  120. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_state.py +0 -0
  121. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_useragent.py +0 -0
  122. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/test_uuid.py +0 -0
  123. {hdx_python_utilities-3.7.1 → hdx_python_utilities-3.7.3}/tests/hdx/utilities/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: hdx-python-utilities
3
- Version: 3.7.1
3
+ Version: 3.7.3
4
4
  Summary: HDX Python Utilities for streaming tabular data, date and time handling and other helpful functions
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-utilities
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -36,6 +36,7 @@ Requires-Dist: requests-file
36
36
  Requires-Dist: ruamel-yaml
37
37
  Requires-Dist: tableschema-to-template>=0.0.13
38
38
  Requires-Dist: xlrd>=2.0.1
39
+ Requires-Dist: xlsx2csv
39
40
  Requires-Dist: xlwt>=1.3.0
40
41
  Provides-Extra: dev
41
42
  Requires-Dist: pre-commit; extra == 'dev'
@@ -785,6 +785,9 @@ Examples:
785
785
 
786
786
  a = "The quick brown fox jumped over the lazy dog. It was so fast!"
787
787
 
788
+ # Normalise text, e.g. to support name matching
789
+ assert normalise("£^*& ()+-[]<>?|\ Al DhaleZ'eÉ / الضالع,,..1234''#~~### ") == "al dhalezee 1234"
790
+
788
791
  # Remove whitespace and punctuation from end of string
789
792
  assert remove_end_characters('lalala,.,"') == "lalala"
790
793
  assert remove_end_characters('lalala, .\t/,"', f"{punctuation}{whitespace}") == "lalala"
@@ -51,6 +51,7 @@ dependencies = [
51
51
  "ratelimit",
52
52
  "requests-file",
53
53
  "ruamel.yaml",
54
+ "xlsx2csv",
54
55
  ]
55
56
  dynamic = ["version"]
56
57
 
@@ -10,7 +10,7 @@ attrs==23.2.0
10
10
  # referencing
11
11
  beautifulsoup4==4.12.3
12
12
  # via hdx-python-utilities (pyproject.toml)
13
- certifi==2024.6.2
13
+ certifi==2024.7.4
14
14
  # via requests
15
15
  cfgv==3.4.0
16
16
  # via pre-commit
@@ -20,7 +20,7 @@ charset-normalizer==3.3.2
20
20
  # via requests
21
21
  click==8.1.7
22
22
  # via typer
23
- coverage==7.5.4
23
+ coverage==7.6.0
24
24
  # via pytest-cov
25
25
  distlib==0.3.8
26
26
  # via virtualenv
@@ -36,9 +36,9 @@ frictionless==5.17.0
36
36
  # via hdx-python-utilities (pyproject.toml)
37
37
  html5lib==1.1
38
38
  # via hdx-python-utilities (pyproject.toml)
39
- humanize==4.9.0
39
+ humanize==4.10.0
40
40
  # via frictionless
41
- identify==2.5.36
41
+ identify==2.6.0
42
42
  # via pre-commit
43
43
  idna==3.7
44
44
  # via
@@ -54,7 +54,7 @@ jinja2==3.1.4
54
54
  # via frictionless
55
55
  jsonlines==4.0.0
56
56
  # via hdx-python-utilities (pyproject.toml)
57
- jsonschema==4.22.0
57
+ jsonschema==4.23.0
58
58
  # via
59
59
  # frictionless
60
60
  # tableschema-to-template
@@ -84,15 +84,15 @@ platformdirs==4.2.2
84
84
  # via virtualenv
85
85
  pluggy==1.5.0
86
86
  # via pytest
87
- pre-commit==3.7.1
87
+ pre-commit==3.8.0
88
88
  # via hdx-python-utilities (pyproject.toml)
89
- pydantic==2.8.0
89
+ pydantic==2.8.2
90
90
  # via frictionless
91
- pydantic-core==2.20.0
91
+ pydantic-core==2.20.1
92
92
  # via pydantic
93
93
  pygments==2.18.0
94
94
  # via rich
95
- pytest==8.2.2
95
+ pytest==8.3.2
96
96
  # via
97
97
  # hdx-python-utilities (pyproject.toml)
98
98
  # pytest-cov
@@ -127,7 +127,7 @@ rfc3986==2.0.0
127
127
  # via frictionless
128
128
  rich==13.7.1
129
129
  # via typer
130
- rpds-py==0.18.1
130
+ rpds-py==0.19.1
131
131
  # via
132
132
  # jsonschema
133
133
  # referencing
@@ -164,7 +164,7 @@ typing-extensions==4.12.2
164
164
  # typer
165
165
  urllib3==2.2.2
166
166
  # via requests
167
- validators==0.29.0
167
+ validators==0.33.0
168
168
  # via frictionless
169
169
  virtualenv==20.26.3
170
170
  # via pre-commit
@@ -172,6 +172,8 @@ webencodings==0.5.1
172
172
  # via html5lib
173
173
  xlrd==2.0.1
174
174
  # via hdx-python-utilities (pyproject.toml)
175
+ xlsx2csv==0.8.3
176
+ # via hdx-python-utilities (pyproject.toml)
175
177
  xlsxwriter==3.2.0
176
178
  # via tableschema-to-template
177
179
  xlwt==1.3.0
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '3.7.1'
16
- __version_tuple__ = version_tuple = (3, 7, 1)
15
+ __version__ = version = '3.7.3'
16
+ __version_tuple__ = version_tuple = (3, 7, 3)
@@ -15,6 +15,7 @@ from frictionless.resources import TableResource
15
15
  from ratelimit import RateLimitDecorator, sleep_and_retry
16
16
  from requests import Request
17
17
  from ruamel.yaml import YAML
18
+ from xlsx2csv import Xlsx2csv
18
19
 
19
20
  from .base_downloader import BaseDownload, DownloadError
20
21
  from .frictionless_wrapper import get_frictionless_tableresource
@@ -669,6 +670,7 @@ class Download(BaseDownload):
669
670
  **kwargs:
670
671
  format (Optional[str]): Type of file. Defaults to inferring.
671
672
  file_type (Optional[str]): Type of file. Defaults to inferring.
673
+ xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
672
674
  encoding (Optional[str]): Type of encoding. Defaults to inferring.
673
675
  compression (Optional[str]): Type of compression. Defaults to inferring.
674
676
  delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -690,6 +692,20 @@ class Download(BaseDownload):
690
692
  """
691
693
  if headers is None:
692
694
  raise DownloadError("Argument headers cannot be None!")
695
+ xlsx2csv = kwargs.pop("xlsx2csv", False)
696
+ if xlsx2csv:
697
+ path = self.download_file(url)
698
+ outpath = path.replace(".xlsx", ".csv")
699
+ sheet = kwargs.pop("sheet", 1)
700
+ if isinstance(sheet, int):
701
+ sheet_args = {"sheetid": sheet}
702
+ else:
703
+ sheet_args = {"sheetname": sheet}
704
+ Xlsx2csv(path).convert(outpath, **sheet_args)
705
+ url = outpath
706
+ kwargs["format"] = "csv" # format takes precedence over file_type
707
+ kwargs.pop("fill_merged_cells", None)
708
+
693
709
  resource = self.get_frictionless_tableresource(
694
710
  url,
695
711
  ignore_blank_rows=ignore_blank_rows,
@@ -771,6 +787,7 @@ class Download(BaseDownload):
771
787
  **kwargs:
772
788
  format (Optional[str]): Type of file. Defaults to inferring.
773
789
  file_type (Optional[str]): Type of file. Defaults to inferring.
790
+ xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
774
791
  encoding (Optional[str]): Type of encoding. Defaults to inferring.
775
792
  compression (Optional[str]): Type of compression. Defaults to inferring.
776
793
  delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -875,6 +892,7 @@ class Download(BaseDownload):
875
892
  **kwargs:
876
893
  format (Optional[str]): Type of file. Defaults to inferring.
877
894
  file_type (Optional[str]): Type of file. Defaults to inferring.
895
+ xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
878
896
  encoding (Optional[str]): Type of encoding. Defaults to inferring.
879
897
  compression (Optional[str]): Type of compression. Defaults to inferring.
880
898
  delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -950,6 +968,7 @@ class Download(BaseDownload):
950
968
  **kwargs:
951
969
  format (Optional[str]): Type of file. Defaults to inferring.
952
970
  file_type (Optional[str]): Type of file. Defaults to inferring.
971
+ xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
953
972
  encoding (Optional[str]): Type of encoding. Defaults to inferring.
954
973
  compression (Optional[str]): Type of compression. Defaults to inferring.
955
974
  delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -418,7 +418,7 @@ class Retrieve(BaseDownload):
418
418
  filename (Optional[str]): Filename of saved file. Defaults to getting from url.
419
419
  logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
420
420
  fallback (bool): Whether to use static fallback if download fails. Defaults to False.
421
- **kwargs: Parameters to pass to download_file call
421
+ **kwargs: Parameters to pass to download_file and get_tabular_rows calls
422
422
 
423
423
  Returns:
424
424
  Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
@@ -4,6 +4,7 @@ import difflib
4
4
  import logging
5
5
  import re
6
6
  import string
7
+ import unicodedata
7
8
  from string import punctuation
8
9
  from typing import Any, Dict, List, Optional, Set, Tuple
9
10
 
@@ -16,6 +17,44 @@ PUNCTUATION_MINUS_BRACKETS = r"""!"#$%&'*+,-./:;<=>?@\^_`|~"""
16
17
  TEMPLATE_VARIABLES = re.compile("{{.*?}}")
17
18
 
18
19
 
20
+ KEEP_CHARS_SAME = set(string.ascii_lowercase).union(set(string.digits))
21
+ CHANGE_TO_LOWERCASE = set(string.ascii_uppercase)
22
+ MAP_TO_SPACE = set(string.punctuation).union(set(string.whitespace))
23
+ MAP_TO_SPACE.remove("'")
24
+
25
+
26
+ def normalise(text: str) -> str:
27
+ """
28
+ Normalise text, for example to support name matching. Accented characters
29
+ are replaced with non-accented if possible. Any punctuation and whitespace
30
+ is replaced with a space except for ' which is replaced with blank.
31
+ Multiple spaces are replaced with a single space. Uppercase is replaced
32
+ with lowercase. Spaces at start and end are removed. All non-ASCII
33
+ characters are removed.
34
+
35
+ Args:
36
+ text (str): Text to normalise
37
+
38
+ Returns:
39
+ str: Normalised text
40
+ """
41
+ chars = []
42
+ space = False
43
+ for chr in unicodedata.normalize("NFD", text):
44
+ if chr in KEEP_CHARS_SAME:
45
+ chars.append(chr)
46
+ space = False
47
+ elif chr in CHANGE_TO_LOWERCASE:
48
+ chars.append(chr.lower())
49
+ space = False
50
+ elif chr in MAP_TO_SPACE:
51
+ if space:
52
+ continue
53
+ chars.append(" ")
54
+ space = True
55
+ return "".join(chars).strip()
56
+
57
+
19
58
  def remove_end_characters(
20
59
  string: str, characters_to_remove: str = punctuation
21
60
  ) -> str:
@@ -1220,6 +1220,82 @@ class TestDownloader:
1220
1220
  "Tulkarm",
1221
1221
  ]
1222
1222
 
1223
+ def test_get_tabular_rows_xlsx2csv(self, fixtureurlexcel):
1224
+ expected_headers = [
1225
+ "GWNO",
1226
+ "EVENT_ID_CNTY",
1227
+ "EVENT_ID_NO_CNTY",
1228
+ "EVENT_DATE",
1229
+ "YEAR",
1230
+ "TIME_PRECISION",
1231
+ "EVENT_TYPE",
1232
+ "ACTOR1",
1233
+ "ALLY_ACTOR_1",
1234
+ "INTER1",
1235
+ "ACTOR2",
1236
+ "ALLY_ACTOR_2",
1237
+ "INTER2",
1238
+ "INTERACTION",
1239
+ "COUNTRY",
1240
+ "ADMIN1",
1241
+ "ADMIN2",
1242
+ "ADMIN3",
1243
+ "LOCATION",
1244
+ "LATITUDE",
1245
+ "LONGITUDE",
1246
+ "GEO_PRECISION",
1247
+ "SOURCE",
1248
+ "NOTES",
1249
+ "FATALITIES",
1250
+ ]
1251
+
1252
+ expected_row = [
1253
+ "615",
1254
+ "1416RTA",
1255
+ None,
1256
+ "18/04/2001",
1257
+ "2001",
1258
+ "1",
1259
+ "Violence against civilians",
1260
+ "Police Forces of Algeria (1999-)",
1261
+ None,
1262
+ "1",
1263
+ "Civilians (Algeria)",
1264
+ "Berber Ethnic Group (Algeria)",
1265
+ "7",
1266
+ "17",
1267
+ "Algeria",
1268
+ "Tizi Ouzou",
1269
+ "Beni-Douala",
1270
+ None,
1271
+ "Beni Douala",
1272
+ "36.61954",
1273
+ "4.08282",
1274
+ "1",
1275
+ "Associated Press Online",
1276
+ "A Berber student was shot while in police custody at a police station in "
1277
+ "Beni Douala. He later died on Apr.21.",
1278
+ "1",
1279
+ ]
1280
+
1281
+ with Download() as downloader:
1282
+ headers, iterator = downloader.get_tabular_rows(
1283
+ fixtureurlexcel,
1284
+ format="xlsx",
1285
+ xlsx2csv=True,
1286
+ )
1287
+ assert headers == expected_headers
1288
+ assert list(iterator)[0] == expected_row
1289
+
1290
+ headers, iterator = downloader.get_tabular_rows(
1291
+ fixtureurlexcel,
1292
+ format="xlsx",
1293
+ xlsx2csv=True,
1294
+ sheet="test_data",
1295
+ )
1296
+ assert headers == expected_headers
1297
+ assert list(iterator)[0] == expected_row
1298
+
1223
1299
  def test_get_tabular_rows_json(self, fixturejsonurl):
1224
1300
  with Download() as downloader:
1225
1301
  headers, iterator = downloader.get_tabular_rows(
@@ -15,6 +15,7 @@ from hdx.utilities.text import (
15
15
  get_words_in_sentence,
16
16
  match_template_variables,
17
17
  multiple_replace,
18
+ normalise,
18
19
  number_format,
19
20
  only_allowed_in_str,
20
21
  remove_end_characters,
@@ -28,6 +29,14 @@ class TestText:
28
29
  b = "The quicker brown fox leapt over the slower fox. It was so fast!"
29
30
  c = "The quick brown fox climbed over the lazy dog. It was so fast!"
30
31
 
32
+ def test_normalise(self):
33
+ assert (
34
+ normalise(
35
+ "£^*& ()+-[]<>?|\ Al DhaleZ'eÉ / الضالع,,..1234''#~~### "
36
+ )
37
+ == "al dhalezee 1234"
38
+ )
39
+
31
40
  def test_remove_end_characters(self):
32
41
  assert remove_end_characters('lalala,.,"') == "lalala"
33
42
  assert (