readabs 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
readabs/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from . import readabs
2
+ from .readabs import *
3
+
4
+ __version__ = "0.0.2"
@@ -0,0 +1,56 @@
1
+ """Catalogue map for ABS data."""
2
+
3
+ from io import StringIO
4
+
5
+ from pandas import DataFrame, read_csv
6
def catalogue_map() -> DataFrame:
    """Build the ABS catalogue map from the CSV text embedded below.

    Returns a DataFrame indexed by the first CSV column ("Catalogue ID")
    with the columns Theme, Parent Topic, Topic, URL and Status."""

    embedded_csv = """Catalogue ID,Theme,Parent Topic,Topic,URL,Status
1364.0.15.003,Economy,National Accounts,Modellers Database,https://www.abs.gov.au/statistics/economy/national-accounts/modellers-database/latest-release,
3101.0,People,Population,National State And Territory Population,https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/latest-release,
3222.0,People,Population,Population Projections Australia,https://www.abs.gov.au/statistics/people/population/population-projections-australia/latest-release,
3401.0,Industry,Tourism And Transport,Overseas Arrivals And Departures Australia,https://www.abs.gov.au/statistics/industry/tourism-and-transport/overseas-arrivals-and-departures-australia/latest-release,
5204.0,Economy,National Accounts,Australian System National Accounts,https://www.abs.gov.au/statistics/economy/national-accounts/australian-system-national-accounts/latest-release,
5206.0,Economy,National Accounts,Australian National Accounts National Income Expenditure And Product,https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product/latest-release,
5220.0,Economy,National Accounts,Australian National Accounts State Accounts,https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-state-accounts/latest-release,
5232.0,Economy,National Accounts,Australian National Accounts Finance And Wealth,https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-finance-and-wealth/latest-release,
5232.0.55.001,Economy,Finance,Assets And Liabilities Australian Securitisers,https://www.abs.gov.au/statistics/economy/finance/assets-and-liabilities-australian-securitisers/latest-release,
5302.0,Economy,International Trade,Balance Payments And International Investment Position Australia,https://www.abs.gov.au/statistics/economy/international-trade/balance-payments-and-international-investment-position-australia/latest-release,
5368.0,Economy,International Trade,International Trade Goods And Services Australia,https://www.abs.gov.au/statistics/economy/international-trade/international-trade-goods-and-services-australia/latest-release,
5368.0.55.024,Economy,International Trade,International Merchandise Trade Preliminary Australia,https://www.abs.gov.au/statistics/economy/international-trade/international-merchandise-trade-preliminary-australia/latest-release,
5601.0,Economy,Finance,Lending Indicators,https://www.abs.gov.au/statistics/economy/finance/lending-indicators/latest-release,
5625.0,Economy,Business Indicators,Private New Capital Expenditure And Expected Expenditure Australia,https://www.abs.gov.au/statistics/economy/business-indicators/private-new-capital-expenditure-and-expected-expenditure-australia/latest-release,
5655.0,Economy,Finance,Managed Funds Australia,https://www.abs.gov.au/statistics/economy/finance/managed-funds-australia/latest-release,
5676.0,Economy,Business Indicators,Business Indicators Australia,https://www.abs.gov.au/statistics/economy/business-indicators/business-indicators-australia/latest-release,
5681.0,Economy,Business Indicators,Monthly Business Turnover Indicator,https://www.abs.gov.au/statistics/economy/business-indicators/monthly-business-turnover-indicator/latest-release,
5682.0,Economy,Finance,Monthly Household Spending Indicator,https://www.abs.gov.au/statistics/economy/finance/monthly-household-spending-indicator/latest-release,
6202.0,Labour,Employment And Unemployment,Labour Force Australia,https://www.abs.gov.au/statistics/labour/employment-and-unemployment/labour-force-australia/latest-release,
6150.0.55.003,Labour,Labour Accounts,Labour Account Australia,https://www.abs.gov.au/statistics/labour/labour-accounts/labour-account-australia/latest-release,
6248.0.55.002,Labour,Employment And Unemployment,Public Sector Employment And Earnings,https://www.abs.gov.au/statistics/labour/employment-and-unemployment/public-sector-employment-and-earnings/latest-release,
6291.0.55.001,Labour,Employment And Unemployment,Labour Force Australia Detailed,https://www.abs.gov.au/statistics/labour/employment-and-unemployment/labour-force-australia-detailed/latest-release,
6302.0,Labour,Earnings And Working Conditions,Average Weekly Earnings Australia,https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/average-weekly-earnings-australia/latest-release,
6321.0.55.001,Labour,Earnings And Working Conditions,Industrial Disputes Australia,https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/industrial-disputes-australia/latest-release,
6345.0,Economy,Price Indexes And Inflation,Wage Price Index Australia,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/wage-price-index-australia/latest-release,
6354.0,Labour,Jobs,Job Vacancies Australia,https://www.abs.gov.au/statistics/labour/jobs/job-vacancies-australia/latest-release,
6401.0,Economy,Price Indexes And Inflation,Consumer Price Index Australia,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/consumer-price-index-australia/latest-release,
6416.0,Economy,Price Indexes And Inflation,Residential Property Price Indexes Eight Capital Cities,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/residential-property-price-indexes-eight-capital-cities/latest-release,Ceased
6427.0,Economy,Price Indexes And Inflation,Producer Price Indexes Australia,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/producer-price-indexes-australia/latest-release,
6432.0,Economy,Price Indexes And Inflation,Total Value Dwellings,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/total-value-dwellings/latest-release,
6457.0,Economy,Price Indexes And Inflation,International Trade Price Indexes Australia,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/international-trade-price-indexes-australia/latest-release,
6467.0,Economy,Price Indexes And Inflation,Selected Living Cost Indexes Australia,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/selected-living-cost-indexes-australia/latest-release,
6484.0,Economy,Price Indexes And Inflation,Monthly Consumer Price Index Indicator,https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/monthly-consumer-price-index-indicator/latest-release,
7215.0,Industry,Agriculture,Livestock Products Australia,https://www.abs.gov.au/statistics/industry/agriculture/livestock-products-australia/latest-release,
7218.0.55.001,Industry,Agriculture,Livestock And Meat Australia,https://www.abs.gov.au/statistics/industry/agriculture/livestock-and-meat-australia/latest-release,Ceased
8155.0,Industry,Industry Overview,Australian Industry,https://www.abs.gov.au/statistics/industry/industry-overview/australian-industry/latest-release,
8165.0,Economy,Business Indicators,Counts Australian Businesses Including Entries And Exits,https://www.abs.gov.au/statistics/economy/business-indicators/counts-australian-businesses-including-entries-and-exits/latest-release,
8412.0,Industry,Mining,Mineral And Petroleum Exploration Australia,https://www.abs.gov.au/statistics/industry/mining/mineral-and-petroleum-exploration-australia/latest-release,
8501.0,Industry,Retail And Wholesale Trade,Retail Trade Australia,https://www.abs.gov.au/statistics/industry/retail-and-wholesale-trade/retail-trade-australia/latest-release,
8701.0,Industry,Building And Construction,Estimated Dwelling Stock,https://www.abs.gov.au/statistics/industry/building-and-construction/estimated-dwelling-stock/latest-release,
8731.0,Industry,Building And Construction,Building Approvals Australia,https://www.abs.gov.au/statistics/industry/building-and-construction/building-approvals-australia/latest-release,
8752.0,Industry,Building And Construction,Building Activity Australia,https://www.abs.gov.au/statistics/industry/building-and-construction/building-activity-australia/latest-release,
8755.0,Industry,Building And Construction,Construction Work Done Australia Preliminary,https://www.abs.gov.au/statistics/industry/building-and-construction/construction-work-done-australia-preliminary/latest-release,
8762.0,Industry,Building And Construction,Engineering Construction Activity Australia,https://www.abs.gov.au/statistics/industry/building-and-construction/engineering-construction-activity-australia/latest-release,
8782.0.65.001,Industry,Building And Construction,Construction Activity Chain Volume Measures Australia,https://www.abs.gov.au/statistics/industry/building-and-construction/construction-activity-chain-volume-measures-australia/jun-2020,Ceased
"""
    # parse the in-memory text; the first column becomes the index
    buffer = StringIO(embedded_csv)
    return read_csv(buffer, index_col=0)
@@ -0,0 +1,40 @@
1
+ """abs_meta_data_support.py
2
+
3
+ Support for working with ABS meta data."""
4
+
5
+ from collections import namedtuple
6
+
7
# Record type whose fields are short aliases for the ABS meta data columns.
Metacol = namedtuple(
    "Metacol",
    "did stype id start end num unit dtype freq cmonth table tdesc cat",
)

# The single shared instance: each field holds the column label
# used in the ABS meta data tables.
metacol = Metacol(
    "Data Item Description",  # did
    "Series Type",  # stype
    "Series ID",  # id
    "Series Start",  # start
    "Series End",  # end
    "No. Obs.",  # num
    "Unit",  # unit
    "Data Type",  # dtype
    "Freq.",  # freq
    "Collection Month",  # cmonth
    "Table",  # table
    "Table Description",  # tdesc
    "Catalogue number",  # cat
)
@@ -0,0 +1,218 @@
1
+ """download_cache.py - a module for downloading and caching data from the web.
2
+
3
+ The default cache directory can be specified by setting the environment
4
+ variable READABS_CACHE_DIR."""
5
+
6
+ # --- imports
7
+ # system imports
8
+ from hashlib import md5
9
+ import re
10
+ from datetime import datetime, timezone
11
+ from os import utime, getenv
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ # data imports
16
+ import pandas as pd
17
+ import requests
18
+
19
+
20
+ # --- constants
21
+ # define the default cache directory
22
# Fallback cache location, relative to the current working directory.
DEFAULT_CACHE_DIR = "./.readabs_cache"
# The READABS_CACHE_DIR environment variable, when set, overrides the default.
# Note: the environment is read once, when this module is imported.
READABS_CACHE_DIR = getenv("READABS_CACHE_DIR", DEFAULT_CACHE_DIR)
# The same location as a Path object (the default cache_dir for get_file()).
READABS_CACHE_PATH = Path(READABS_CACHE_DIR)

# Timeout passed to requests.get() when downloading a file.
DOWNLOAD_TIMEOUT = 60  # seconds
27
+
28
+
29
+ # --- Exception classes
30
class HttpError(Exception):
    """Raised when data could not be retrieved over HTTP."""
32
+
33
+
34
class CacheError(Exception):
    """Raised when data could not be retrieved from the local cache."""
36
+
37
+
38
+ # --- functions
39
def check_for_bad_response(
    url: str,
    response: requests.Response,
    **kwargs: Any,
) -> bool:
    """Report whether the HTTP response for url indicates a failure.

    A response is bad when its status code is not 200 or it has no headers.
    Returns False for a good response. For a bad response: when the
    "ignore_errors" keyword argument is true, the problem is printed and
    True is returned; otherwise HttpError is raised."""

    status = response.status_code
    if status == 200 and response.headers is not None:
        return False

    problem = f"Problem {status} accessing: {url}."
    if kwargs.get("ignore_errors", False):
        print(problem)
        return True
    raise HttpError(problem)
58
+
59
+
60
def request_get(
    url: str,
    **kwargs: Any,
) -> bytes:
    """Download the contents of url with the requests library.

    Keyword arguments read here: "verbose" (announce the download) and
    "ignore_errors" (on failure print a message and return b"" rather
    than raising HttpError). All keyword arguments are also forwarded
    to check_for_bad_response()."""

    tolerate_failure = kwargs.get("ignore_errors", False)
    if kwargs.get("verbose", False):
        print(f"About to request/download: {url}")

    try:
        reply = requests.get(url, allow_redirects=True, timeout=DOWNLOAD_TIMEOUT)
    except requests.exceptions.RequestException as e:
        error = f"request_get(): there was a problem downloading {url}."
        if tolerate_failure:
            print(error)
            return b""
        raise HttpError(error) from e

    # check_for_bad_response() either raises (errors not ignored) or
    # prints the problem and returns True (errors ignored).
    is_bad = check_for_bad_response(url, reply, **kwargs)
    return b"" if is_bad else reply.content
91
+
92
+
93
def save_to_cache(
    file: Path,
    contents: bytes,
    **kwargs: Any,
) -> None:
    """Save bytes to the file-system.

    Arguments:
    file -- the path of the cache file to (re)write.
    contents -- the bytes to save; nothing is written when this is empty.
    kwargs -- only "verbose" is read here (print progress messages)."""

    verbose = kwargs.get("verbose", False)
    if len(contents) == 0:
        # don't save empty files (probably caused by ignoring errors)
        return
    if file.exists():
        if verbose:
            print("Removing old cache file.")
        file.unlink()
    if verbose:
        print(f"About to save to cache: {file}")
    # write_bytes() opens, writes and closes the file itself, so no
    # separate open() call is needed (the earlier one leaked a handle).
    file.write_bytes(contents)
112
+
113
+
114
def retrieve_from_cache(file: Path, **kwargs: Any) -> bytes:
    """Read the bytes of a cached file from the file-system.

    When file does not exist (or is not a regular file): if the
    "ignore_errors" keyword argument is true, a message is printed and
    b"" is returned; otherwise CacheError is raised. The "verbose"
    keyword argument announces a successful retrieval."""

    if file.exists() and file.is_file():
        if kwargs.get("verbose", False):
            print(f"Retrieving from cache: {file}")
        return file.read_bytes()

    message = f"Cached file not available: {file.name}"
    if kwargs.get("ignore_errors", False):
        print(message)
        return b""
    raise CacheError(message)
129
+
130
+
131
def get_file(
    url: str,
    cache_dir: Path = READABS_CACHE_PATH,
    cache_prefix: str = "cache",
    **kwargs: Any,
) -> bytes:
    """Get a file from URL or local file-system cache, depending on freshness.

    The URL is downloaded when there is no cached copy, or when the
    "Last-Modified" header reported for the URL is newer than the
    modification time of the cached copy. Otherwise the cached copy
    is returned.

    Arguments:
    url -- the address of the file to retrieve.
    cache_dir -- the cache directory (created if it does not exist).
    cache_prefix -- prepended to the name of the cache file.
    kwargs -- "ignore_errors" is read here; all keyword arguments are
        passed on to the download and cache helper functions.

    Returns: the contents of the file as bytes.
    Raises: HttpError for HTTP problems (unless "ignore_errors" is true),
        CacheError if cache_dir is not a directory."""

    def get_fpath() -> Path:
        """Convert URL string into a cache file name,
        then return as a Path object."""
        bad_cache_pattern = r'[~"#%&*:<>?\\{|}]+'  # chars to remove from name
        # md5 is only used to derive a distinct file name for each URL
        hash_name = md5(url.encode("utf-8")).hexdigest()
        tail_name = url.split("/")[-1].split("?")[0]
        file_name = re.sub(
            bad_cache_pattern, "", f"{cache_prefix}--{hash_name}--{tail_name}"
        )
        return Path(cache_dir / file_name)

    # create and check cache_dir is a directory
    cache_dir.mkdir(parents=True, exist_ok=True)
    if not cache_dir.is_dir():
        raise CacheError(f"Cache path is not a directory: {cache_dir.name}")

    # get URL modification time in UTC
    source_time = None
    try:
        response = requests.head(url, allow_redirects=True, timeout=20)
    except requests.exceptions.RequestException as e:
        # treat a failed HEAD request like a failed download in
        # request_get(): raise HttpError, or report and carry on
        problem = f"get_file(): there was a problem accessing {url}."
        if not kwargs.get("ignore_errors", False):
            raise HttpError(problem) from e
        print(problem)
    else:
        if not check_for_bad_response(url, response, **kwargs):
            source_time = response.headers.get("Last-Modified", None)
    source_mtime = (
        None if source_time is None else pd.to_datetime(source_time, utc=True)
    )

    # get cache modification time in UTC
    target_mtime: datetime | None = None
    file_path = get_fpath()
    if file_path.exists() and file_path.is_file():
        target_mtime = pd.to_datetime(
            datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc), utc=True
        )

    # get and save URL source data
    if target_mtime is None or (  # cache is empty, or
        source_mtime is not None
        and source_mtime > target_mtime  # URL is fresher than cache
    ):
        url_bytes = request_get(url, **kwargs)  # raises exception if it fails
        save_to_cache(file_path, url_bytes, **kwargs)
        # - change file mod time to reflect mtime at URL
        if source_mtime is not None and len(url_bytes) > 0:
            unixtime = source_mtime.value / 1_000_000_000  # convert to seconds
            utime(file_path, (unixtime, unixtime))
        return url_bytes

    # return the data that has been cached previously
    return retrieve_from_cache(file_path, **kwargs)
190
+
191
+
192
+ # --- preliminary testing:
193
# Switch for the manual smoke test below; it only runs when this module
# is executed as a script AND this flag is set to True.
DO_TEST = False
if __name__ == "__main__" and DO_TEST:

    def cache_test() -> None:
        """Quick manual check of the retrieval and caching system.
        The same URL is fetched twice: the first retrieval should come
        from the web, the second from the cache. Clear the cache
        directory beforehand to see the effect of the cache."""

        # the test case
        url1 = (
            "https://www.abs.gov.au/statistics/labour/employment-and-"
            + "unemployment/labour-force-australia/nov-2023/6202001.xlsx"
        )
        width = 20
        heavy_rule = "=" * width
        light_rule = "-" * width

        print("Test commencing.")
        for _ in range(2):
            print(heavy_rule)
            content = get_file(url1, verbose=True)
            print(light_rule)
            print(f"{len(content)} bytes retrieved from {url1}.")
        print(heavy_rule)
        print("Test completed.")

    cache_test()
@@ -0,0 +1,56 @@
1
+ """Generate the catalogue_map.py file."""
2
+
3
+ # --- imports
4
+ from io import StringIO
5
+ import pandas as pd
6
+ from pandas import DataFrame, Series, Index
7
+ from download_cache import get_file
8
+
9
+
10
+ # --- functions
11
+ # private
12
def _get_abs_directory() -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers.

    The ABS time series directory web page is downloaded (or taken from
    the cache) and its second HTML table is converted into a DataFrame
    indexed by "Catalogue ID", with the columns Theme, Parent Topic,
    Topic, URL and Status."""

    # get ABS web page of catalogue numbers
    url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
    page = get_file(url)
    tables = pd.read_html(StringIO(page.decode("utf-8")), extract_links="body")
    links = tables[1]  # second table on the page

    # extract catalogue numbers
    # (with extract_links each cell is a (text, link) pair)
    cats = links["Catalogue Number"].apply(Series)[0]
    urls = links["Topic"].apply(Series)[1]
    root = "https://www.abs.gov.au/statistics/"
    # Note: regex=False is given explicitly on every replace because the
    # default differs between pandas versions, and "(Ceased)" would
    # otherwise be read as a regular expression group.
    snip = urls.str.replace(root, "", regex=False)
    snip = (
        snip[~snip.str.contains("http")]
        .str.replace("-", " ", regex=False)
        .str.title()
    )  # remove bad cases
    frame = snip.str.split("/", expand=True).iloc[:, :3]
    frame.columns = Index(["Theme", "Parent Topic", "Topic"])
    frame["URL"] = urls
    cats = cats[frame.index]
    cat_index = cats.str.replace("(Ceased)", "", regex=False).str.strip()
    # an ID that changed when "(Ceased)" was removed marks a ceased series
    status = Series(" ", index=cats.index).where(cat_index == cats, "Ceased")
    frame["Status"] = status
    frame.index = Index(cat_index)
    frame.index.name = "Catalogue ID"
    return frame
40
+
41
+
42
def produce_catalogue_map():
    """Write the abs_catalogue_map.py source file into the current
    directory. The generated module defines catalogue_map(), which
    rebuilds the ABS directory table from CSV text embedded in it."""

    directory = _get_abs_directory()
    with open("abs_catalogue_map.py", "w", encoding="utf-8") as file:
        source_lines = [
            '"""Catalogue map for ABS data."""\n\n',
            "from io import StringIO\n\n",
            "from pandas import DataFrame, read_csv\n",
            "def catalogue_map() -> DataFrame:\n",
            '    """Return the catalogue map."""\n\n',
            f'    csv = """{directory.to_csv()}"""\n',
            "    return read_csv(StringIO(csv), index_col=0)\n",
        ]
        file.writelines(source_lines)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ produce_catalogue_map()
@@ -0,0 +1,121 @@
1
+ """To do"""
2
+
3
+ import re
4
+ from typing import Any
5
+ from bs4 import BeautifulSoup
6
+
7
+ # local imports - ugly, need to find out how to fix this
8
+ if __package__ is None or __package__ == "":
9
+ from download_cache import get_file, HttpError, CacheError
10
+ else:
11
+ from .download_cache import get_file, HttpError, CacheError
12
+
13
+
14
+ # private
15
+ def _make_absolute_url(url: str, prefix: str = "https://www.abs.gov.au") -> str:
16
+ """Convert a relative URL address found on the ABS site to
17
+ an absolute URL address."""
18
+
19
+ # remove a prefix if it already exists (just to be sure)
20
+ url = url.replace(prefix, "")
21
+ url = url.replace(prefix.replace("https://", "http://"), "")
22
+ # then add the prefix (back) ...
23
+ return f"{prefix}{url}"
24
+
25
+
26
+ # public (also used by read_abs_cat.py)
27
def get_table_name(url: str) -> str:
    """Return the table name embedded in an ABS URL: the text after
    the final "/" and before the first "." that follows it."""

    file_part = url.rpartition("/")[2]
    return file_part.partition(".")[0]
33
+
34
+
35
+ # private
36
def historicise_links(
    link_dict: dict[str, list[str]], history: str
) -> dict[str, list[str]]:
    """Age ABS links so that they point to a historical version of the data.

    The second last "/"-separated part of every link is replaced by the
    history string, and a new dictionary with the same keys is returned.
    Note: the history string is typically in "mon-yr" format, but not always.
    Note: we are also assuming that the date is in the second last part of
    the URL. These assumptions may not always hold."""

    def aged(link: str) -> str:
        # split off the last two parts; the middle one is discarded
        head, _, tail = link.rsplit("/", 2)
        return "/".join([head, history, tail])

    return {
        link_type: [aged(link) for link in link_list]
        for link_type, link_list in link_dict.items()
    }
54
+
55
+
56
+ # public
57
def get_data_links(
    url: str,  # the URL of the ABS page to scan
    inspect_file_name: str = "",  # for debugging - save the page to disk
    **kwargs: Any,
) -> dict[str, list[str]]:
    """Scan the webpage at the ABS URL for links to ZIP files and for
    links to Microsoft Excel files.

    Arguments:
    url -- the URL of the ABS page to scan.
    inspect_file_name -- if not empty, the page is also saved to this file.
    kwargs -- "verbose" and "history" are read here; all keyword
        arguments are passed on to get_file().

    Returns the links in a dictionary of lists indexed by file type ending
    (".zip" or ".xlsx"). Relative links are expanded to absolute links.
    An empty dictionary is returned if the page could not be obtained."""

    # get relevant web-page from ABS website
    verbose = kwargs.get("verbose", False)
    if verbose:
        print("Getting data links from the ABS web page.")
    try:
        page = get_file(url, **kwargs)
    except (HttpError, CacheError) as e:
        print(f"Error when obtaining links from ABS web page: {e}")
        return {}

    # save the HTML webpage to disk for inspection
    if inspect_file_name:
        with open(inspect_file_name, "w", encoding="utf-8") as file_handle:
            file_handle.write(page.decode("utf-8"))

    # remove those pesky span tags - probably not necessary
    page = re.sub(b"<span[^>]*>", b" ", page)
    page = re.sub(b"</span>", b" ", page)
    page = re.sub(b"\\s+", b" ", page)  # tidy up white space

    # capture all links (of ZIP and Microsoft Excel types)
    link_types = (
        ".zip",
        ".xlsx",
    )  # must be lower case
    soup = BeautifulSoup(page, features="lxml")
    link_dict: dict[str, list[str]] = {}
    for anchor in soup.find_all("a"):
        # a separate name is used so the url argument is not overwritten
        href = anchor.get("href")
        if href is None:
            # ignore silly cases
            continue
        if "pivot" in href.rsplit("/", 1)[-1].lower():
            # ignore pivot tables
            continue
        for link_type in link_types:
            if href.lower().endswith(link_type):
                link_dict.setdefault(link_type, []).append(_make_absolute_url(href))
                break

    # age links if required
    history = kwargs.get("history", "")
    if history:
        link_dict = historicise_links(link_dict, history)

    if verbose:
        print("Found links to the following ABS data tables:")
        for link_type, link_list in link_dict.items():
            summary = [get_table_name(x) for x in link_list]  # just the file name
            print(f"Found: {len(link_list)} items of type {link_type}: {summary}")
        print()

    return link_dict
@@ -0,0 +1,389 @@
1
+ """read_abs_cat.py
2
+
3
+ Download all/selected timeseries data from the
4
+ Australian Bureau of Statistics (ABS) for a specified
5
+ ABS catalogue identifier and package that data into a
6
+ dictionary of DataFrames."""
7
+
8
+ # --- imports ---
9
+ # standard library imports
10
+ import calendar
11
+ import zipfile
12
+ from functools import cache
13
+ from io import BytesIO
14
+ from typing import Any, Callable, cast
15
+
16
+ # analytic imports
17
+ import pandas as pd
18
+ from pandas import DataFrame
19
+
20
+ # local imports - ugly, need to find out how to fix this
21
+ #print(f"in read_abs_cat.py: __main__={__name__}, __package__={__package__}")
22
+ if __package__ is None or __package__ == "":
23
+ from abs_meta_data_support import metacol
24
+ from get_data_links import get_data_links, get_table_name
25
+ from abs_catalogue_map import catalogue_map
26
+ from read_support import check_kwargs, get_args
27
+ from download_cache import get_file
28
+ else:
29
+ from .abs_meta_data_support import metacol
30
+ from .get_data_links import get_data_links, get_table_name
31
+ from .abs_catalogue_map import catalogue_map
32
+ from .read_support import check_kwargs, get_args
33
+ from .download_cache import get_file
34
+
35
+
36
+ # --- functions ---
37
+ # private
38
def _get_meta_from_excel(
    excel: pd.ExcelFile,
    table: str,
    tab_desc: str,
    cat_id: str,
) -> pd.DataFrame:
    """Capture the metadata from the Index sheet of an ABS excel file.
    Returns a DataFrame specific to the current excel file, with one
    row per ABS data item. An empty DataFrame is returned when the
    metadata could not be identified."""

    required = metacol.did, metacol.id, metacol.stype, metacol.unit
    needed = set(required)

    # The header is tried at row 9 first and then at row 10, because
    # the header for some of the 3401.0 spreadsheets starts on row 10.
    for header_row in (9, 10):
        parsed = excel.parse(
            "Index",
            header=header_row,
            parse_dates=True,
            infer_datetime_format=True,
            converters={"Unit": str},
        )
        # drop first and last 2 rows, then any completely empty columns
        file_meta = parsed.iloc[1:-2].dropna(axis="columns", how="all")
        if needed.issubset(set(file_meta.columns)):
            break
    else:
        # neither header row produced all of the required columns
        print(f"Could not find metadata for {cat_id}-{tab_desc}")
        return pd.DataFrame()

    # add the table name and table description to the metadata
    file_meta[metacol.table] = table.strip()
    file_meta[metacol.tdesc] = tab_desc.strip()
    file_meta[metacol.cat] = cat_id.strip()

    # make sure there are no rogue white spaces
    for col in required:
        file_meta[col] = file_meta[col].str.strip()

    return file_meta
82
+
83
+
84
+ # private
85
+ def _unpack_excel_into_df(
86
+ excel: pd.ExcelFile,
87
+ meta: DataFrame,
88
+ freq: str,
89
+ verbose: bool,
90
+ ) -> DataFrame:
91
+ """Take an ABS excel file and put all the Data sheets into a single
92
+ pandas DataFrame and return that DataFrame."""
93
+
94
+ data = DataFrame()
95
+ data_sheets = [x for x in excel.sheet_names if cast(str, x).startswith("Data")]
96
+ for sheet_name in data_sheets:
97
+ sheet_data = excel.parse(
98
+ sheet_name,
99
+ header=9,
100
+ index_col=0,
101
+ ).dropna(how="all", axis="index")
102
+ data.index = pd.to_datetime(data.index)
103
+
104
+ # merge data into a large dataframe
105
+ if len(data) == 0:
106
+ data = sheet_data
107
+ else:
108
+ data = pd.merge(
109
+ left=data,
110
+ right=sheet_data,
111
+ how="outer",
112
+ left_index=True,
113
+ right_index=True,
114
+ suffixes=("", ""),
115
+ )
116
+ if freq:
117
+ if freq in ("Q", "A"):
118
+ month = calendar.month_abbr[
119
+ cast(pd.PeriodIndex, data.index).month.max()
120
+ ].upper()
121
+ freq = f"{freq}-{month}"
122
+ if isinstance(data.index, pd.DatetimeIndex):
123
+ data = data.to_period(freq=freq)
124
+
125
+ # check for NA columns - rarely happens
126
+ # Note: these empty columns are not removed,
127
+ # but it is useful to know they are there
128
+ if data.isna().all().any() and verbose:
129
+ cols = data.columns[data.isna().all()]
130
+ print(
131
+ "Caution: these columns are all NA in "
132
+ + f"{meta[metacol.table].iloc[0]}: {cols}"
133
+ )
134
+
135
+ # check for duplicate columns - should not happen
136
+ # Note: these duplicate columns are removed
137
+ duplicates = data.columns.duplicated()
138
+ if duplicates.any():
139
+ if verbose:
140
+ dup_table = meta[metacol.table].iloc[0]
141
+ print(
142
+ f"Note: duplicates removed from {dup_table}: "
143
+ + f"{data.columns[duplicates]}"
144
+ )
145
+ data = data.loc[:, ~duplicates].copy()
146
+ return data
147
+
148
+
149
+ # private
150
+ def _extract_data_from_excel(
151
+ raw_bytes: bytes, table_name: str, **kwargs: Any
152
+ ) -> tuple[DataFrame, DataFrame]:
153
+ """Convert the raw bytes of an Excel file into a pandas DataFrame.
154
+ Returns the actual data and meta data in two separate DataFrames."""
155
+
156
+ ignore_errors = kwargs.get("ignore_errors", False)
157
+
158
+ # convert the raw bytes into a pandas ExcelFile
159
+ try:
160
+ excel = pd.ExcelFile(BytesIO(raw_bytes))
161
+ except Exception as e:
162
+ message = f"With {table_name}: could not convert raw bytes to ExcelFile.\n{e}"
163
+ if ignore_errors:
164
+ print(message)
165
+ return pd.DataFrame(), pd.DataFrame()
166
+ raise RuntimeError(message) from e
167
+
168
+ excel = pd.ExcelFile(BytesIO(raw_bytes))
169
+
170
+ # get table information (ie. the meta data)
171
+ if "Index" not in excel.sheet_names:
172
+ print(
173
+ "Caution: Could not find the 'Index' "
174
+ f"sheet in {table_name}. File not included"
175
+ )
176
+ return pd.DataFrame(), pd.DataFrame()
177
+
178
+ # get table header information
179
+ header = excel.parse("Index", nrows=8) # ???
180
+ cat_id = header.iat[3, 1].split(" ")[0].strip()
181
+ tab_desc = header.iat[4, 1].split(".", 1)[-1].strip()
182
+
183
+ # get the metadata rows
184
+ file_meta = _get_meta_from_excel(excel, table_name, tab_desc, cat_id)
185
+ if len(file_meta) == 0:
186
+ return pd.DataFrame(), pd.DataFrame()
187
+
188
+ # establish freq - used for making the index a PeriodIndex
189
+ freq_dict = {"annual": "Y", "biannual": "Q", "quarter": "Q", "month": "M"}
190
+ freqlist = file_meta["Freq."].str.lower().unique()
191
+ if not len(freqlist) == 1 or freqlist[0] not in freq_dict:
192
+ print(f"Unrecognised data frequency {freqlist} for {tab_desc}")
193
+ return pd.DataFrame(), pd.DataFrame()
194
+ freq = freq_dict[freqlist[0]]
195
+
196
+ data = _unpack_excel_into_df(
197
+ excel, file_meta, freq, verbose=kwargs.get("verbose", False)
198
+ )
199
+
200
+ return data, file_meta
201
+
202
+
203
+ # private
204
def _process_zip_binary(
    zip_contents: bytes,
    **kwargs: Any,
) -> tuple[dict[str, DataFrame], DataFrame]:
    """Unpack every member of an in-memory ZIP archive into DataFrames.

    Each archive member is passed to _extract_data_from_excel(); members
    that come back as an empty DataFrame are skipped.

    Parameters
    ----------
    zip_contents : bytes
        The raw bytes of the ZIP archive.
    **kwargs : Any
        Forwarded unchanged to _extract_data_from_excel(). The "verbose"
        key (default False) also turns on progress messages here.

    Returns
    -------
    tuple[dict[str, DataFrame], DataFrame]
        A dictionary of data tables keyed by table name, and the
        concatenated ABS meta data for those tables."""

    chatty = kwargs.get("verbose", False)
    if chatty:
        print("Extracting DataFrames from the zip-file binary.")

    tables: dict[str, DataFrame] = {}
    meta = DataFrame()

    with zipfile.ZipFile(BytesIO(zip_contents)) as archive:
        for position, member in enumerate(archive.infolist()):
            # turn the archive member into a data table plus its meta data
            name = get_table_name(url=member.filename)
            frame, frame_meta = _extract_data_from_excel(
                archive.read(member.filename), name, **kwargs
            )
            if not len(frame):
                # nothing could be captured for this member
                continue

            if name in tables:
                # Duplicate table names are not expected; if one turns up,
                # make the name unique by appending the member's position
                # in the archive, and record the new name in the meta data.
                unique_name = f"{name}-{position}"
                if chatty:
                    print(f"Changing duplicate table name from {name} to {unique_name}.")
                name = unique_name
                frame_meta[metacol.table] = name

            # accumulate the meta data, then store the table itself
            meta = pd.concat([meta, frame_meta])
            tables[name] = frame

    return tables, meta
246
+
247
+
248
+ # private
249
def _add_zip(
    link: str, abs_dict: dict[str, DataFrame], abs_meta: DataFrame, **args
) -> tuple[dict[str, DataFrame], DataFrame]:
    """Download one ZIP file and merge its tables into the running results.

    The archive at `link` is fetched with get_file() and unpacked with
    _process_zip_binary(); both receive `args` unchanged. `abs_dict` is
    updated in place, while the meta data is returned as a new
    concatenated DataFrame. If get_file() returns nothing, the inputs
    are handed back untouched."""

    payload = get_file(link, **args)
    if len(payload) == 0:
        # nothing was downloaded
        return abs_dict, abs_meta

    new_tables, new_meta = _process_zip_binary(payload, **args)
    abs_dict.update(new_tables)
    return abs_dict, pd.concat([abs_meta, new_meta], axis=0)
262
+
263
+
264
+ # private
265
def _add_excel(
    link: str,
    abs_dict: dict[str, DataFrame],
    abs_meta: DataFrame,
    **args: Any,
) -> tuple[dict[str, DataFrame], DataFrame]:
    """Download one Excel file and add its table to the running results.

    The table name is derived from `link` with get_table_name(). The
    file is only fetched when that name is not already a key of
    `abs_dict`. `args` is forwarded unchanged to get_file() and
    _extract_data_from_excel(). On any early exit the inputs are
    returned as they were received."""

    table = get_table_name(link)
    unchanged = (abs_dict, abs_meta)

    if table in abs_dict:
        # this table has already been collected
        return unchanged

    content = get_file(link, **args)
    if len(content) == 0:
        # nothing was downloaded
        return unchanged

    frame, frame_meta = _extract_data_from_excel(content, table, **args)
    if len(frame) == 0:
        # no data could be extracted from the downloaded file
        return unchanged

    abs_dict[table] = frame
    return abs_dict, pd.concat([abs_meta, frame_meta], axis=0)
289
+
290
+
291
+ # private
292
def _add_single(
    name: str,
    abs_dict: dict[str, DataFrame],
    abs_meta: DataFrame,
    links: dict[str, list[str]],
    typology: str,  # ".zip" or ".xlsx"
    **args,
) -> tuple[dict[str, DataFrame], DataFrame]:
    """Fetch exactly one named zip or excel file from the ABS links.

    The links of the requested `typology` are indexed by their table
    name. If `name` is not among them, a ValueError is raised unless
    args["ignore_errors"] is truthy, in which case the message is
    printed and the inputs are returned unchanged. Otherwise the file
    is loaded with _add_zip() (for ".zip") or _add_excel() (anything
    else), with `args` passed through."""

    by_table_name = {}
    for candidate in links.get(typology, []):
        by_table_name[get_table_name(candidate)] = candidate

    if name not in by_table_name:
        message = f"File ({name}{typology}) not found on ABS web page."
        if args["ignore_errors"]:
            print(message)
            return abs_dict, abs_meta
        raise ValueError(message)

    if typology == ".zip":
        loader = _add_zip
    else:
        loader = _add_excel
    return loader(by_table_name[name], abs_dict, abs_meta, **args)
313
+
314
+
315
+ # public -- primary entry point for this module
316
@cache  # minimise slowness with repeat business
def read_abs_cat(
    cat: str, **kwargs: Any  # ABS catalogue number  # keyword arguments
) -> tuple[dict[str, DataFrame], DataFrame]:
    """Read the ABS data for a catalogue id and return the data.

    Parameters
    ----------
    cat : str
        The ABS catalogue number.
    **kwargs : Any
        Keyword arguments for the read_abs_cat function. Unrecognised
        keys are reported by check_kwargs() and then dropped by
        get_args(), which also supplies defaults for missing keys.

    Returns
    -------
    tuple[dict[str, DataFrame], DataFrame]
        A dictionary of DataFrames and a DataFrame of the meta data.
        The dictionary is indexed by table names, which can be found
        in the meta data DataFrame. If the ABS web page yields no data
        links, an empty dictionary and an empty DataFrame are returned.

    Raises
    ------
    ValueError
        If get_zip, get_excel and get_excel_if_no_zip are all false,
        or if `cat` is not in the index of the catalogue map.

    Notes
    -----
    Results are memoised with functools.cache, so repeat calls with the
    same arguments return the very same objects; callers should not
    modify them in place."""

    # check/get the keyword arguments
    check_kwargs(kwargs, "read_abs_cat")
    args = get_args(kwargs)

    if not (args["get_zip"] or args["get_excel"] or args["get_excel_if_no_zip"]):
        raise ValueError(
            "read_abs_cat: at least one of get_zip, get_excel "
            "or get_excel_if_no_zip must be True."
        )

    # convert the catalogue number to the ABS webpage URL
    cm = catalogue_map()
    if cat not in cm.index:
        raise ValueError(f"ABS catalogue number {cat} not found.")
    url = cm["URL"].astype(str)[cat]

    # get the URL links to the relevant ABS data files on that webpage
    links = get_data_links(url, **args)
    if not links:
        print(f"No data files found for catalogue number {cat}")
        return {}, DataFrame()  # return an empty dictionary, DataFrame

    # read the data files into a dictionary of DataFrames
    abs_dict: dict[str, DataFrame] = {}
    abs_meta: DataFrame = DataFrame()

    if args["single_excel_only"]:
        abs_dict, abs_meta = _add_single(
            args["single_excel_only"], abs_dict, abs_meta, links, ".xlsx", **args
        )

    elif args["single_zip_only"]:
        abs_dict, abs_meta = _add_single(
            args["single_zip_only"], abs_dict, abs_meta, links, ".zip", **args
        )

    else:
        # These decisions do not change from link to link, so make them once.
        want_zip = args["get_zip"]
        want_excel = args["get_excel"] or (
            args["get_excel_if_no_zip"]
            and (not want_zip or not links.get(".zip", []))
        )

        # .zip must come first: _add_excel() skips tables already collected
        if want_zip:
            for link in links.get(".zip", []):
                abs_dict, abs_meta = _add_zip(link, abs_dict, abs_meta, **args)

        if want_excel:
            for link in links.get(".xlsx", []):
                # NOTE(review): `links` is forwarded as an extra keyword and so
                # reaches get_file() and _extract_data_from_excel() via **args;
                # confirm this is intended (the _add_zip call does not pass it).
                abs_dict, abs_meta = _add_excel(
                    link, abs_dict, abs_meta, links=links, **args
                )

    # reset the index of the metadata
    return abs_dict, abs_meta.reset_index()
@@ -0,0 +1,95 @@
1
+ """read_abs_series.py
2
+
3
+ Get specific ABS data series by their ABS series identifiers."""
4
+
5
+ # --- imports
6
+ # system imports
7
+ from typing import Any, Sequence, cast
8
+
9
+ # analytic imports
10
+ from pandas import DataFrame, PeriodIndex, concat
11
+
12
+ # local imports
13
+ if __package__ is None or __package__ == "":
14
+ from read_abs_cat import read_abs_cat
15
+ from read_support import check_kwargs, get_args
16
+ from abs_meta_data_support import metacol
17
+ else:
18
+ from .read_abs_cat import read_abs_cat
19
+ from .read_support import check_kwargs, get_args
20
+ from .abs_meta_data_support import metacol
21
+
22
+
23
+ # --- functions
24
def read_abs_series(
    cat: str,
    series_id: str | Sequence[str],
    **kwargs: Any,
) -> tuple[DataFrame, DataFrame]:
    """Get specific ABS data series by their ABS catalogue ID and series ID.

    Parameters
    ----------
    cat : str
        The ABS catalogue ID.
    series_id : str | Sequence[str]
        An ABS series ID or a sequence of ABS series IDs.
    **kwargs : Any
        Keyword arguments for the read_abs_series function,
        which are the same as the keyword arguments for the
        read_abs_cat function.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        The ABS series data (one column per series ID found) and the
        associated meta data (one row per series ID found).

    Raises
    ------
    ValueError
        If a series ID is not in the catalogue's meta data, or if a
        series has a different index frequency from the series already
        collected. With ignore_errors=True the offending series is
        skipped instead.
    """

    # check for unexpected keyword arguments/get defaults
    check_kwargs(kwargs, "read_abs_series")
    args = get_args(kwargs)

    # read the ABS category data
    cat_data, cat_meta = read_abs_cat(cat, **args)

    # Index the meta data by series ID, keeping the first row for any
    # repeated ID. read_abs_cat() is cached and hands every caller the same
    # DataFrame, so work on a new object (set_index returns one) rather
    # than assigning to cat_meta.index in place.
    if metacol.id in cat_meta.columns:
        meta_by_id = cat_meta.set_index(metacol.id, drop=False)
        meta_by_id = meta_by_id.groupby(meta_by_id.index).first()
    else:
        # read_abs_cat() found no data: every requested ID is reported
        # through the normal "not found" path below.
        meta_by_id = DataFrame()

    # get the ABS series data
    if isinstance(series_id, str):
        series_id = [series_id]
    return_data, return_meta = DataFrame(), DataFrame()
    for identifier in series_id:

        # confirm that the series ID is in the catalogue
        if identifier not in meta_by_id.index:
            if args["verbose"]:
                print(f"Series ID {identifier} not found in ABS catalogue ID {cat}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Series ID {identifier} not found in catalogue {cat}")

        # confirm that the index of the series is compatible
        table = meta_by_id.loc[identifier, metacol.table]
        data_series = cat_data[table][identifier]
        if (
            len(return_data) > 0
            and cast(PeriodIndex, return_data.index).freq
            != cast(PeriodIndex, data_series.index).freq
        ):
            if args["verbose"]:
                print(f"Frequency mismatch for series ID {identifier}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Frequency mismatch for series ID {identifier}")

        # add the series data and meta data to the return values
        if len(return_data) > 0:
            # widen the index first so no period of the new series is lost
            return_data = return_data.reindex(
                return_data.index.union(data_series.index)
            )
        return_data[identifier] = data_series
        return_meta = concat([return_meta, meta_by_id.loc[identifier]], axis=1)

    # the meta data was collected column-wise; flip to one row per series
    return return_data, return_meta.T
@@ -0,0 +1,31 @@
1
+ """Support for the read ABS data functions, all of which take the
2
+ same keyword arguments. This module provides a way to check for
3
+ unexpected keyword arguments and to provide default values for
4
+ those arguments that are not provided."""
5
+
6
+ from typing import Any
7
+
8
# The keyword arguments recognised by the read functions, with defaults.
# check_kwargs() warns about any key that is not in this table, and
# get_args() fills in these defaults for keys the caller did not supply.
DEFAULTS: dict[str, Any] = {
    # argument_name: default_value,
    "verbose": False,  # print progress/diagnostic messages
    "ignore_errors": False,  # report-and-skip rather than raise ValueError
    "get_zip": True,  # read_abs_cat: load the .zip links
    "get_excel_if_no_zip": True,  # read_abs_cat: load .xlsx links when zips are off or absent
    "get_excel": False,  # read_abs_cat: always load the .xlsx links
    "single_zip_only": "",  # read_abs_cat: table name of the one zip file to load
    "single_excel_only": "",  # read_abs_cat: table name of the one excel file to load
    "history": "",  # NOTE(review): not read by the functions shown here - confirm usage
}
19
+
20
+
21
def check_kwargs(kwargs: dict[str, Any], name: str) -> None:
    """Print a warning for each key of kwargs that is not in DEFAULTS.

    `name` is the calling function's name and prefixes each warning.
    Nothing is raised and kwargs is left unmodified."""
    unexpected = [key for key in kwargs if key not in DEFAULTS]
    for key in unexpected:
        print(f"{name}: Unexpected keyword argument {key}")
26
+
27
+
28
def get_args(kwargs: dict[str, Any]) -> dict[str, Any]:
    """Build the complete argument dictionary for the read functions.

    The result has exactly the keys of DEFAULTS: each takes the value
    supplied in kwargs when present, and the default otherwise. Keys of
    kwargs that are not in DEFAULTS are left out."""
    resolved: dict[str, Any] = {}
    for key, default in DEFAULTS.items():
        resolved[key] = kwargs.get(key, default)
    return resolved
readabs/readabs.py ADDED
@@ -0,0 +1,40 @@
1
+ """Read time series data from the Australian Bureau of Statistics (ABS)."""
2
+
3
+ # --- imports
4
+ # system imports
5
+
6
+ # analytic imports
7
+
8
+ # local imports
9
+ from .abs_catalogue_map import catalogue_map
10
+ from .get_data_links import get_data_links
11
+ from .read_abs_cat import read_abs_cat
12
+ from .read_abs_series import read_abs_series
13
+ from .abs_meta_data_support import metacol
14
+ from .utilities import (
15
+ percent_change,
16
+ annualise_rates,
17
+ annualise_percentages,
18
+ qtly_to_monthly,
19
+ monthly_to_qtly,
20
+ )
21
+
22
_ = (
    # Reference the names imported above purely for re-export, so that
    # linters/checkers do not report them as unused imports.
    get_data_links,
    metacol,
    read_abs_cat,
    read_abs_series,
    percent_change,
    annualise_rates,
    annualise_percentages,
    qtly_to_monthly,
    monthly_to_qtly,
)
34
+
35
+
36
+ # --- functions
37
def print_abs_catalogue() -> None:
    """Print the ABS catalogue map as a markdown table.

    Every column of catalogue_map() is shown except "URL"."""
    catalogue = catalogue_map()
    keep = catalogue.columns != "URL"
    without_urls = catalogue.loc[:, keep]
    print(without_urls.to_markdown())
readabs/readabs.pyi ADDED
@@ -0,0 +1,26 @@
1
+ """Stubs for readabs."""
2
+
3
+ from typing import Any, Sequence
4
+ from pandas import DataFrame, Series
5
+
6
+
7
+ # TO DO: metacol
8
+
9
# NOTE(review): readabs.py also imports percent_change, annualise_rates,
# annualise_percentages, qtly_to_monthly and monthly_to_qtly, but no stubs
# for them appear below - confirm whether they should be added.

# Return the ABS catalogue map.
def catalogue_map() -> DataFrame: ...
# Print the ABS catalogue (without the URL column).
def print_abs_catalogue() -> None: ...

# Return the data-file links found on an ABS web page, keyed by file type.
def get_data_links(
    url: str, inspect_file_name: str = "", **kwargs: Any,
) -> dict[str, list[str]]: ...

# Return (tables keyed by table name, meta data) for an ABS catalogue id.
def read_abs_cat(
    cat: str, **kwargs: Any,
) -> tuple[dict[str, DataFrame], DataFrame]: ...

# Return (series data, meta data) for the requested ABS series id(s).
def read_abs_series(
    cat: str,
    series_id: str | Sequence[str],
    **kwargs: Any,
) -> tuple[DataFrame, DataFrame]: ...
25
+
26
+
readabs/utilities.py ADDED
@@ -0,0 +1,98 @@
1
+ """utilities.py
2
+
3
+ This module provides a small number of utilities for
4
+ working with ABS timeseries data."""
5
+
6
+ # --- imports
7
+ from typing import TypeVar, Optional, cast
8
+ from pandas import Series, DataFrame, PeriodIndex, DatetimeIndex
9
+ from numpy import nan
10
+
11
+ # - define a useful typevar for working with both Series and DataFrames
12
+ DataT = TypeVar("DataT", Series, DataFrame)
13
+
14
+
15
+ # --- functions
16
def percent_change(data: DataT, m_periods: int) -> DataT:
    """Return the percentage change in data over m_periods.

    Each value is compared with the value m_periods rows earlier
    (via .shift), so the first m_periods results are missing."""

    ratio = data / data.shift(m_periods)
    return (ratio - 1) * 100
20
+
21
+
22
def annualise_rates(data: DataT, periods_per_year: int | float = 12) -> DataT:
    """Compound a per-period growth rate up to an annual figure.

    The input is a rate (e.g. 0.01 for one per cent per period).
    Note: the result is a percentage (and not a rate)!"""

    growth_factor = 1 + data
    compounded = growth_factor**periods_per_year
    return (compounded - 1) * 100
27
+
28
+
29
def annualise_percentages(data: DataT, periods_per_year: int | float = 12) -> DataT:
    """Compound a per-period percentage growth up to an annual percentage.

    The percentages are divided by 100 to give rates, which are then
    passed to annualise_rates()."""

    return annualise_rates(data / 100.0, periods_per_year)
34
+
35
+
36
+ def qtly_to_monthly(
37
+ data: DataT,
38
+ interpolate: bool = True,
39
+ limit: Optional[int] = 2, # only used if interpolate is True
40
+ dropna: bool = True,
41
+ ) -> DataT:
42
+ """Convert a pandas timeseries with a Quarterly PeriodIndex to an
43
+ timeseries with a Monthly PeriodIndex.
44
+
45
+ Arguments:
46
+ ==========
47
+ data - either a pandas Series or DataFrame - assumes the index is unique.
48
+ interpolate - whether to interpolate the missing monthly data.
49
+ dropna - whether to drop NA data
50
+
51
+ Notes:
52
+ ======
53
+ Necessitated by Pandas 2.2, which removed .resample()
54
+ from pandas objects with a PeriodIndex."""
55
+
56
+ # sanity checks
57
+ assert isinstance(data.index, PeriodIndex)
58
+ assert data.index.freqstr[0] == "Q"
59
+ assert data.index.is_unique
60
+ assert data.index.is_monotonic_increasing
61
+
62
+ def set_axis_monthly_periods(x: DataT) -> DataT:
63
+ """Convert a DatetimeIndex to a Monthly PeriodIndex."""
64
+
65
+ return x.set_axis(
66
+ labels=cast(DatetimeIndex, x.index).to_period(freq="M"), axis="index"
67
+ )
68
+
69
+ # do the heavy lifting
70
+ data = (
71
+ data.set_axis(
72
+ labels=data.index.to_timestamp(how="end"), axis="index", copy=True
73
+ )
74
+ .resample(rule="ME") # adds in every missing month
75
+ .first(min_count=1) # generates nans for new months
76
+ # assumes only one value per quarter (ie. unique index)
77
+ .pipe(set_axis_monthly_periods)
78
+ )
79
+
80
+ if interpolate:
81
+ data = data.interpolate(limit_area="inside", limit=limit)
82
+ if dropna:
83
+ data = data.dropna()
84
+
85
+ return data
86
+
87
+
88
def monthly_to_qtly(data: DataT, q_ending="DEC", f: str = "mean") -> DataT:
    """Convert monthly data to quarterly data.

    By default the quarterly value is the mean of the three months in
    each quarter; change f to "sum" for a quarterly sum. Quarters with
    fewer than three (non-missing) months of data are ignored, and NA
    results are dropped.

    Arguments:
    ==========
    data - the monthly data; its index is converted to a quarterly
           PeriodIndex with frequency "Q-<q_ending>".
    q_ending - three-letter month in which the final quarter of the
               year ends (default "DEC").
    f - name of the aggregation applied to each quarter (default "mean").

    NOTE(review): the row-wise count test below works on the two-column
    result produced for a Series; DataFrame input looks unsupported
    here - confirm."""

    quarters = PeriodIndex(data.index, freq=f"Q-{q_ending}")
    return (
        data.groupby(quarters)
        .agg([f, "count"])
        # pick the column named by f (the original always read "mean",
        # which raised KeyError for any other aggregation such as "sum")
        .apply(lambda row: row[f] if row["count"] == 3 else nan, axis=1)
        .dropna()
    )
@@ -0,0 +1,8 @@
1
+ Copyright 2024 Bryan Palmer (Canberra Australia)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8
+
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.1
2
+ Name: readabs
3
+ Version: 0.0.2
4
+ Summary: Get ABS timeseries data in pandas DataFrames
5
+ Author-email: Bryan Palmer <palmer.bryan@gmail.com>
6
+ Maintainer-email: Bryan Palmer <palmer.bryan@gmail.com>
7
+ Project-URL: Homepage, https://github.com/bpalmer4/readabs
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ License-File: LICENSE
13
+
@@ -0,0 +1,17 @@
1
+ readabs/__init__.py,sha256=oRPBeNE3YqW2BPHpluXF39OfwCFGrVws2dMTeXyhnhM,68
2
+ readabs/abs_catalogue_map.py,sha256=XrTc844NEV0g6CaxadS0HsCJweOGUQwpRMEdi2I5iBY,8922
3
+ readabs/abs_meta_data_support.py,sha256=PgVOWIGb3_axFwYDnCIZ0IBJ5WUTtBIfNwkdjFfQ9zs,705
4
+ readabs/download_cache.py,sha256=h_ElUmdJJuBm5DAB9KefShOWxCvMHskHDwVlV6L5IiE,6960
5
+ readabs/generate_catalogue_map.py,sha256=5Air2d4fvZVezJt9fzUQc7WLX1aHsv2y4Yn0SJtXbRk,2011
6
+ readabs/get_data_links.py,sha256=wLL2p8cZMUVM-PYCoh5XKO2-lt3J9QwEGL17CFaYDq4,4116
7
+ readabs/read_abs_cat.py,sha256=6Sb1meL_NlcjXRB3J7VbjbtDvIlRP5GzKzmKD5ZpUGI,13464
8
+ readabs/read_abs_series.py,sha256=R0ogok9Wm4fgL59ZYcI2TObPLEQjAH5QxRZ2qoUjhmc,3175
9
+ readabs/read_support.py,sha256=AQAvOQ-FlpQBRHrLQHx44OiUDgN1twRPrwVHcekh6Bk,1007
10
+ readabs/readabs.py,sha256=D1zJXYbrc1mipDCp9rKmPjuGx91MLIKlRMHKH-pq0ig,909
11
+ readabs/readabs.pyi,sha256=N6psbIWpV5obaltk0BbFl2Eip0LUaPN0hCFzhr5wQXs,524
12
+ readabs/utilities.py,sha256=-L2kSe-141l-8s5fKj-bSPxs7o5VKFDU5JKaqU5rGDU,3124
13
+ readabs-0.0.2.dist-info/LICENSE,sha256=YMg097MHV-y9Yg1sZK7T9nueRGswD4cEcCRtst9FGxE,1082
14
+ readabs-0.0.2.dist-info/METADATA,sha256=cLZ2HVbv1q19NV_89nhnftjpdmuo8BDrtcir7XaNkiA,464
15
+ readabs-0.0.2.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
16
+ readabs-0.0.2.dist-info/top_level.txt,sha256=lA7BwCI3L6fvTyx0HcMTcS3FhgXkCiEL3sXUQ2WtLbE,8
17
+ readabs-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (70.1.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ readabs