mergeron 2025.739319.3__py3-none-any.whl → 2025.739341.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mergeron might be problematic. Click here for more details.

mergeron/__init__.py CHANGED
@@ -1,3 +1,5 @@
1
+ """Variables, types, objects and functions used throughout the package."""
2
+
1
3
  from __future__ import annotations
2
4
 
3
5
  import enum
@@ -6,13 +8,14 @@ from multiprocessing import cpu_count
6
8
  from pathlib import Path
7
9
  from typing import Any, Literal
8
10
 
11
+ import attrs
9
12
  import numpy as np
10
13
  from numpy.typing import NDArray
11
14
  from ruamel import yaml
12
15
 
13
- _PKG_NAME: str = Path(__file__).parent.stem
16
+ _PKG_NAME: str = Path(__file__).parent.name
14
17
 
15
- VERSION = "2025.739319.3"
18
+ VERSION = "2025.739341.8"
16
19
 
17
20
  __version__ = VERSION
18
21
 
@@ -34,7 +37,7 @@ EMPTY_ARRAYINT = np.array([], int)
34
37
 
35
38
  NTHREADS = 2 * cpu_count()
36
39
 
37
- PKG_ATTRS_MAP: dict[str, object] = {}
40
+ PKG_ATTRS_MAP: dict[str, type] = {}
38
41
 
39
42
  np.set_printoptions(precision=24, floatmode="fixed")
40
43
 
@@ -75,23 +78,21 @@ this_yaml.indent(mapping=2, sequence=4, offset=2)
75
78
  def yaml_rt_mapper(
76
79
  _c: yaml.constructor.RoundTripConstructor, _n: yaml.MappingNode
77
80
  ) -> Mapping[str, Any]:
78
- """
79
- Constructs a mapping from a mapping node with the RoundTripConstructor
80
-
81
- """
81
+ """Construct mapping from a mapping node with the RoundTripConstructor."""
82
82
  data_: Mapping[str, Any] = yaml.constructor.CommentedMap()
83
83
  _c.construct_mapping(_n, maptyp=data_, deep=True)
84
84
  return data_
85
85
 
86
86
 
87
- def yamelize_attrs(
88
- _typ: object, /, *, attr_map: Mapping[str, object] = PKG_ATTRS_MAP
89
- ) -> None:
87
+ def yamelize_attrs(_typ: type, /, *, attr_map: dict[str, type] = PKG_ATTRS_MAP) -> None:
90
88
  """Add yaml representer, constructor for attrs-defined class.
91
89
 
92
- Applying this function, attributes with property, `init=False` are
93
- not serialized to YAML.
90
+ Attributes with property, `init=False` are not serialized/deserialized
91
+ to YAML by the functions defined here. These attributes can, of course,
92
+ be dumped to stand-alone (YAML) representation, and deserialized from there.
94
93
  """
94
+ if not attrs.has(_typ):
95
+ raise ValueError(f"Object {_typ} is not attrs-defined")
95
96
 
96
97
  attr_map |= {_typ.__name__: _typ}
97
98
 
@@ -99,9 +100,6 @@ def yamelize_attrs(
99
100
  _typ,
100
101
  lambda _r, _d: _r.represent_mapping(
101
102
  f"!{_d.__class__.__name__}",
102
- # construct mapping, rather than calling attrs.asdict(),
103
- # to use yaml representers defined in this package for
104
- # "upstream" objects
105
103
  {_a.name: getattr(_d, _a.name) for _a in _d.__attrs_attrs__ if _a.init},
106
104
  ),
107
105
  )
@@ -113,12 +111,13 @@ def yamelize_attrs(
113
111
 
114
112
  @this_yaml.register_class
115
113
  class Enameled(enum.Enum):
116
- """Add YAML representer, constructor for enum.Enum"""
114
+ """Add YAML representer, constructor for enum.Enum."""
117
115
 
118
116
  @classmethod
119
117
  def to_yaml(
120
- cls, _r: yaml.representer.RoundTripRepresenter, _d: object[enum.EnumType]
118
+ cls, _r: yaml.representer.RoundTripRepresenter, _d: enum.Enum
121
119
  ) -> yaml.ScalarNode:
120
+ """Serialize enumerations by .name, not .value."""
122
121
  return _r.represent_scalar(
123
122
  f"!{super().__getattribute__(cls, '__name__')}", f"{_d.name}"
124
123
  )
@@ -126,8 +125,10 @@ class Enameled(enum.Enum):
126
125
  @classmethod
127
126
  def from_yaml(
128
127
  cls, _c: yaml.constructor.RoundTripConstructor, _n: yaml.ScalarNode
129
- ) -> object[enum.EnumType]:
130
- return super().__getattribute__(cls, _n.value)
128
+ ) -> enum.EnumType:
129
+ """Deserialize enumeration."""
130
+ retval: enum.EnumType = super().__getattribute__(cls, _n.value)
131
+ return retval
131
132
 
132
133
 
133
134
  @this_yaml.register_class
@@ -181,10 +182,7 @@ class RECForm(str, Enameled):
181
182
  @this_yaml.register_class
182
183
  @enum.unique
183
184
  class UPPAggrSelector(str, Enameled):
184
- """
185
- Aggregator for GUPPI and diversion ratio estimates.
186
-
187
- """
185
+ """Aggregator for GUPPI and diversion ratio estimates."""
188
186
 
189
187
  AVG = "average"
190
188
  CPA = "cross-product-share weighted average"
mergeron/core/__init__.py CHANGED
@@ -1,3 +1,5 @@
1
+ """Constants, types, objects and functions used within this sub-package."""
2
+
1
3
  from __future__ import annotations
2
4
 
3
5
  from collections.abc import Mapping
@@ -8,6 +10,7 @@ from typing import Any
8
10
  import mpmath # type: ignore
9
11
  import numpy as np
10
12
  from attrs import cmp_using, field, frozen
13
+ from numpy.random import PCG64DXSM
11
14
 
12
15
  from .. import ( # noqa: TID252
13
16
  VERSION,
@@ -20,13 +23,15 @@ from .. import ( # noqa: TID252
20
23
 
21
24
  __version__ = VERSION
22
25
 
26
+ DEFAULT_BITGENERATOR = PCG64DXSM
27
+
23
28
  type MPFloat = mpmath.ctx_mp_python.mpf
24
- type MPMatrix = mpmath.matrix # type: ignore
29
+ type MPMatrix = mpmath.matrices.matrices._matrix
25
30
 
26
31
 
27
32
  @frozen
28
33
  class GuidelinesBoundary:
29
- """Output of a Guidelines boundary function."""
34
+ """Represents Guidelines boundary analytically."""
30
35
 
31
36
  coordinates: ArrayDouble
32
37
  """Market-share pairs as Cartesian coordinates of points on the boundary."""
@@ -35,8 +40,19 @@ class GuidelinesBoundary:
35
40
  """Area under the boundary."""
36
41
 
37
42
 
43
+ @frozen
44
+ class GuidelinesBoundaryCallable:
45
+ """A function to generate Guidelines boundary points, along with area and knot."""
46
+
47
+ boundary_function: Callable[[ArrayDouble], ArrayDouble]
48
+ area: float
49
+ s_naught: float = 0
50
+
51
+
38
52
  @frozen
39
53
  class INVTableData:
54
+ """Represents individual table of FTC merger investigations data."""
55
+
40
56
  industry_group: str
41
57
  additional_evidence: str
42
58
  data_array: ArrayBIGINT = field(eq=cmp_using(eq=np.array_equal))
@@ -45,7 +61,7 @@ class INVTableData:
45
61
  type INVData = MappingProxyType[
46
62
  str, MappingProxyType[str, MappingProxyType[str, INVTableData]]
47
63
  ]
48
- type INVData_in = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
64
+ type INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
49
65
 
50
66
  yamelize_attrs(INVTableData)
51
67
 
@@ -91,14 +107,14 @@ _, _ = (
91
107
 
92
108
  def _dict_from_mapping(_p: Mapping[Any, Any], /) -> dict[Any, Any]:
93
109
  retval: dict[Any, Any] = {}
94
- for _k, _v in _p.items(): # for subit in it:
110
+ for _k, _v in _p.items():
95
111
  retval |= {_k: _dict_from_mapping(_v)} if isinstance(_v, Mapping) else {_k: _v}
96
112
  return retval
97
113
 
98
114
 
99
115
  def _mappingproxy_from_mapping(_p: Mapping[Any, Any], /) -> MappingProxyType[Any, Any]:
100
116
  retval: dict[Any, Any] = {}
101
- for _k, _v in _p.items(): # for subit in it:
117
+ for _k, _v in _p.items():
102
118
  retval |= (
103
119
  {_k: _mappingproxy_from_mapping(_v)}
104
120
  if isinstance(_v, Mapping)
@@ -1,6 +1,4 @@
1
- """
2
- Functions to parse margin data compiled by
3
- Prof. Aswath Damodaran, Stern School of Business, NYU.
1
+ """Functions to parse margin data compiled by Prof. Aswath Damodaran, Stern School of Business, NYU.
4
2
 
5
3
  Provides :func:`margin_data_resampler` for generating margin data
6
4
  from an estimated Gaussian KDE from the source (margin) data.
@@ -10,9 +8,8 @@ Data are downloaded or reused from a local copy, on demand.
10
8
  For terms of use of Prof. Damodaran's data, please see:
11
9
  https://pages.stern.nyu.edu/~adamodar/New_Home_Page/datahistory.html
12
10
 
13
- NOTES
11
+ Notes
14
12
  -----
15
-
16
13
  Prof. Damodaran notes that the data construction may not be
17
14
  consistent from iteration to iteration. He also notes that,
18
15
  "the best use for my data is in real time corporate financial analysis
@@ -36,171 +33,50 @@ price-cost margins fall in the interval :math:`[0, 1]`.
36
33
 
37
34
  """
38
35
 
39
- import shutil
36
+ import datetime
37
+ import os
38
+ import re
40
39
  import zipfile
41
40
  from pathlib import Path
42
41
  from types import MappingProxyType
43
42
 
44
43
  import numpy as np
45
44
  import urllib3
46
- from joblib import Parallel, delayed
47
- from numpy.random import PCG64DXSM, Generator, SeedSequence
45
+ from bs4 import BeautifulSoup
46
+ from joblib import Parallel, delayed # type: ignore
47
+ from numpy.random import Generator, SeedSequence
48
+ from python_calamine import CalamineWorkbook
48
49
  from scipy import stats # type: ignore
49
- from xlrd import open_workbook # type: ignore
50
50
 
51
51
  from .. import NTHREADS, VERSION, ArrayDouble, this_yaml # noqa: TID252
52
52
  from .. import WORK_DIR as PKG_WORK_DIR # noqa: TID252
53
- from .. import data as mdat # noqa: TID252
54
- from . import _mappingproxy_from_mapping
53
+ from . import DEFAULT_BITGENERATOR
55
54
 
56
55
  __version__ = VERSION
57
56
 
58
57
  WORK_DIR = globals().get("WORK_DIR", PKG_WORK_DIR)
59
- """Redefined, in case the user defines WORK_DIR betweeen module imports."""
58
+ """Redefined, in case the user defines WORK_DIR between module imports."""
60
59
 
61
60
  MGNDATA_ARCHIVE_PATH = WORK_DIR / "damodaran_margin_data_serialized.zip"
62
61
 
63
62
  type DamodaranMarginData = MappingProxyType[str, MappingProxyType[str, float | int]]
64
63
 
65
- u3pm = urllib3.PoolManager()
66
-
67
-
68
- def margin_data_getter( # noqa: PLR0912
69
- _table_name: str = "margin",
70
- *,
71
- data_archive_path: Path = MGNDATA_ARCHIVE_PATH,
72
- data_download_flag: bool = False,
73
- ) -> DamodaranMarginData:
74
- if _table_name != "margin": # Not validated for other tables
75
- raise ValueError(
76
- "This code is designed for parsing Prof. Damodaran's margin tables."
77
- )
78
-
79
- data_archive_path = data_archive_path or MGNDATA_ARCHIVE_PATH
80
- workbook_path = data_archive_path.parent / f"damodaran_{_table_name}_data.xls"
81
- if data_archive_path.is_file() and not data_download_flag:
82
- with zipfile.ZipFile(data_archive_path) as _yzip:
83
- margin_data_dict = this_yaml.load(
84
- _yzip.read(data_archive_path.with_suffix(".yaml").name)
85
- )
86
- return margin_data_dict
87
- elif workbook_path.is_file():
88
- workbook_path.unlink()
89
- if data_archive_path.is_file():
90
- data_archive_path.unlink()
91
-
92
- margin_urlstr = (
93
- f"https://pages.stern.nyu.edu/~adamodar/pc/datasets/{_table_name}.xls"
94
- )
95
- try:
96
- chunk_size_ = 1024 * 1024
97
- with (
98
- u3pm.request(
99
- "GET", margin_urlstr, preload_content=False
100
- ) as _urlopen_handle,
101
- workbook_path.open("wb") as margin_file,
102
- ):
103
- while True:
104
- data_ = _urlopen_handle.read(chunk_size_)
105
- if not data_:
106
- break
107
- margin_file.write(data_)
108
-
109
- print(f"Downloaded {margin_urlstr} to {workbook_path}.")
110
-
111
- except urllib3.exceptions.MaxRetryError as error_:
112
- if isinstance(error_.__cause__, urllib3.exceptions.SSLError):
113
- # Works fine with other sites secured with certificates
114
- # from the Internet2 CA, such as,
115
- # https://snap.stanford.edu/data/web-Stanford.txt.gz
116
- print(
117
- f"WARNING: Could not establish secure connection to, {margin_urlstr}."
118
- "Using bundled copy."
119
- )
120
- if not workbook_path.is_file():
121
- shutil.copy2(mdat.DAMODARAN_MARGIN_WORKBOOK, workbook_path)
122
- else:
123
- raise error_
124
-
125
- xl_book = open_workbook(workbook_path, ragged_rows=True, on_demand=True)
126
- xl_sheet = xl_book.sheet_by_name("Industry Averages")
127
-
128
- margin_dict_in: dict[str, dict[str, float | int]] = {}
129
- row_keys: list[str] = []
130
- read_row_flag = False
131
- for _ridx in range(xl_sheet.nrows):
132
- xl_row = xl_sheet.row_values(_ridx)
133
- if xl_row[0] == "Industry Name":
134
- read_row_flag = True
135
- row_keys = xl_row
136
- continue
137
-
138
- if not xl_row[0] or not read_row_flag:
139
- continue
140
-
141
- xl_row[1] = int(xl_row[1])
142
- margin_dict_in[xl_row[0]] = dict(zip(row_keys[1:], xl_row[1:], strict=True))
143
-
144
- margin_dict = _mappingproxy_from_mapping(margin_dict_in)
145
- with (
146
- zipfile.ZipFile(data_archive_path, "w") as _yzip,
147
- _yzip.open(f"{data_archive_path.stem}.yaml", "w") as _yfh,
148
- ):
149
- this_yaml.dump(margin_dict, _yfh)
150
-
151
- return margin_dict
152
-
153
-
154
- def margin_data_builder(
155
- _src_data_dict: DamodaranMarginData | None = None, /
156
- ) -> tuple[ArrayDouble, ArrayDouble]:
157
- if _src_data_dict is None:
158
- _src_data_dict = margin_data_getter()
159
-
160
- margin_data_wts, margin_data_obs = (
161
- _f.flatten()
162
- for _f in np.hsplit(
163
- np.array([
164
- tuple(
165
- _src_data_dict[_g][_h] for _h in ["Number of firms", "Gross Margin"]
166
- )
167
- for _g in _src_data_dict
168
- if not _g.startswith("Total Market")
169
- and _g
170
- not in {
171
- "Bank (Money Center)",
172
- "Banks (Regional)",
173
- "Brokerage & Investment Banking",
174
- "Financial Svcs. (Non-bank & Insurance)",
175
- "Insurance (General)",
176
- "Insurance (Life)",
177
- "Insurance (Prop/Cas.)",
178
- "Investments & Asset Management",
179
- "R.E.I.T.",
180
- "Retail (REITs)",
181
- "Reinsurance",
182
- }
183
- ]),
184
- 2,
185
- )
186
- )
187
-
188
- margin_wtd_avg = np.average(margin_data_obs, weights=margin_data_wts)
189
- # https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf
190
- margin_wtd_stderr = np.sqrt(
191
- np.average((margin_data_obs - margin_wtd_avg) ** 2, weights=margin_data_wts)
192
- * (len(margin_data_wts) / (len(margin_data_wts) - 1))
193
- )
194
-
195
- return np.stack([margin_data_obs, margin_data_wts], axis=1, dtype=float), np.round(
196
- (
197
- margin_wtd_avg,
198
- margin_wtd_stderr,
199
- margin_data_obs.min(),
200
- margin_data_obs.max(),
201
- ),
202
- 8,
64
+ FINANCIAL_INDUSTRIES = {
65
+ _i.upper()
66
+ for _i in (
67
+ "Bank (Money Center)",
68
+ "Banks (Regional)",
69
+ "Brokerage & Investment Banking",
70
+ "Financial Svcs. (Non-bank & Insurance)",
71
+ "Insurance (General)",
72
+ "Insurance (Life)",
73
+ "Insurance (Prop/Cas.)",
74
+ "Investments & Asset Management",
75
+ "R.E.I.T.",
76
+ "Retail (REITs)",
77
+ "Reinsurance",
203
78
  )
79
+ }
204
80
 
205
81
 
206
82
  def margin_data_resampler(
@@ -211,8 +87,7 @@ def margin_data_resampler(
211
87
  seed_sequence: SeedSequence | None = None,
212
88
  nthreads: int = NTHREADS,
213
89
  ) -> ArrayDouble:
214
- """
215
- Generate draws from the empirical distribution bassed on Prof. Damodaran's margin data.
90
+ """Generate draws from the empirical distribution based on Prof. Damodaran's margin data.
216
91
 
217
92
  The empirical distribution is estimated using a Gaussian KDE; the bandwidth
218
93
  selected using Silverman's rule is narrowed to reflect that the margin data
@@ -222,7 +97,6 @@ def margin_data_resampler(
222
97
 
223
98
  Parameters
224
99
  ----------
225
-
226
100
  _dist_parms
227
101
  Array of margins and firm counts extracted from Prof. Damodaran's margin data
228
102
 
@@ -235,12 +109,12 @@ def margin_data_resampler(
235
109
 
236
110
  nthreads
237
111
  Number of threads to use in generating margin data.
112
+
238
113
  Returns
239
114
  -------
240
115
  Array of margin values
241
116
 
242
117
  """
243
-
244
118
  _dist_parms = margin_data_builder()[0] if _dist_parms is None else _dist_parms
245
119
 
246
120
  _seed = seed_sequence or SeedSequence(pool_size=8)
@@ -248,23 +122,204 @@ def margin_data_resampler(
248
122
  _x, _w = _dist_parms[:, 0], _dist_parms[:, 1]
249
123
 
250
124
  margin_kde = stats.gaussian_kde(_x, weights=_w, bw_method="silverman")
125
+ # preserve multiplicity of modes:
251
126
  margin_kde.set_bandwidth(bw_method=margin_kde.factor / 3.0)
252
127
 
253
128
  if isinstance(sample_size, int):
254
- return margin_kde.resample(sample_size, seed=Generator(PCG64DXSM(_seed))).T
129
+ return margin_kde.resample(
130
+ sample_size, seed=Generator(DEFAULT_BITGENERATOR(_seed))
131
+ ).T
255
132
 
256
133
  elif isinstance(sample_size, tuple) and len(sample_size) == 2:
257
134
  ret_array = np.empty(sample_size, float)
258
135
 
259
136
  _ssz, _ncol = sample_size
260
- dat_list = Parallel(n_jobs=min(nthreads, _ncol), prefer="threads")(
261
- delayed(margin_kde.resample)(_ssz, seed=Generator(PCG64DXSM(_col_seed)))
137
+ _threads = min(nthreads, _ncol)
138
+ dat_list = Parallel(n_jobs=_threads, prefer="threads")(
139
+ delayed(margin_kde.resample)(
140
+ _ssz, seed=Generator(DEFAULT_BITGENERATOR(_col_seed))
141
+ )
262
142
  for _col_seed in _seed.spawn(_ncol)
263
143
  )
264
144
 
265
145
  for _i in range(_ncol):
266
- ret_array[:, [_i]] = dat_list[_i].T
146
+ ret_array[:, _i] = dat_list[_i][0]
267
147
 
268
148
  return ret_array
269
149
  else:
270
150
  raise ValueError(f"Invalid sample size: {sample_size!r}")
151
+
152
+
153
+ def margin_data_builder(
154
+ _margin_data_dict: DamodaranMarginData | None = None,
155
+ ) -> tuple[ArrayDouble, ArrayDouble]:
156
+ """Derive average firm-counts and gross-margins by industry from source data."""
157
+ _margin_data_dict = (
158
+ margin_data_getter() if _margin_data_dict is None else _margin_data_dict
159
+ )
160
+ dmd_keys = set()
161
+ for _k, _v in _margin_data_dict.items():
162
+ dmd_keys.update(set(_v.keys()))
163
+ dmd_keys = sorted(dmd_keys)
164
+
165
+ dist_parms = np.array([np.nan, np.nan], dtype=float)
166
+ for _sk in dmd_keys:
167
+ if _sk in FINANCIAL_INDUSTRIES or _sk.startswith("TOTAL"):
168
+ continue
169
+
170
+ _missing = {"GROSS MARGIN": 0.0, "NUMBER OF FIRMS": 0.0}
171
+ gm, fc = zip(*[
172
+ [_v.get(_sk, _missing).get(_f) for _f in _missing]
173
+ for _k, _v in _margin_data_dict.items()
174
+ ])
175
+
176
+ average_margin, firm_count = np.array(gm, float), np.array(fc, int)
177
+ # print(firm_count, average_margin)
178
+ dist_parms = np.vstack((
179
+ dist_parms,
180
+ np.array((
181
+ np.average(
182
+ average_margin, weights=(average_margin > 0) * (firm_count > 0)
183
+ ),
184
+ np.average(firm_count, weights=(average_margin > 0) * (firm_count > 0)),
185
+ )),
186
+ ))
187
+
188
+ dist_parms = dist_parms[1:, :]
189
+
190
+ obs_, wts_ = (dist_parms[:, _f] for _f in range(2))
191
+
192
+ avg_gm, num_firms = np.average(obs_, weights=wts_, returned=True)
193
+ std_gm = np.sqrt(
194
+ np.average((obs_ - avg_gm) ** 2, weights=wts_)
195
+ * num_firms
196
+ * len(obs_)
197
+ / ((num_firms - len(obs_)) * (len(obs_) - 1))
198
+ )
199
+
200
+ return dist_parms, np.array([avg_gm, std_gm, obs_.min(), obs_.max()], float)
201
+
202
+
203
+ def margin_data_getter(
204
+ *, data_archive_path: Path | None = None, data_download_flag: bool = False
205
+ ) -> DamodaranMarginData:
206
+ """Download and parse Prof. Damodaran's margin data."""
207
+ data_archive_path = (
208
+ MGNDATA_ARCHIVE_PATH if data_archive_path is None else data_archive_path
209
+ )
210
+ if data_archive_path.is_file() and not data_download_flag:
211
+ with zipfile.ZipFile(data_archive_path) as _yzp:
212
+ margin_data_dict: DamodaranMarginData = this_yaml.load(
213
+ _yzp.read(data_archive_path.with_suffix(".yaml").name)
214
+ )
215
+ return margin_data_dict
216
+
217
+ # Get workbooks from source
218
+ elif data_download_flag or not list(data_archive_path.glob("margin*.xls")):
219
+ margin_data_downloader()
220
+
221
+ # Whitespace cleanup
222
+ ws_pat = re.compile(r"\s+")
223
+
224
+ # Parse workbooks and save margin data dictionary
225
+ margin_data_dict = {}
226
+ for _p in (WORK_DIR / "damodaran_margin_data_archive").iterdir():
227
+ xl_wbk = CalamineWorkbook.from_path(_p)
228
+ xl_wks = xl_wbk.get_sheet_by_index(
229
+ 0 if (_p.stem != "margin" and int(_p.stem[-2:]) in {17, 18, 19}) else 1
230
+ ).to_python()
231
+ if xl_wks[8][2] != "Gross Margin":
232
+ raise ValueError("Worksheet does not match expected layout.")
233
+
234
+ update = xl_wks[0][1].isoformat()[:10]
235
+ margin_data_annual = margin_data_dict.setdefault(update, {})
236
+ row_keys: list[str] = []
237
+ read_row_flag = False
238
+ for xl_row in xl_wks:
239
+ row_key = _s.upper() if isinstance((_s := xl_row[0]), str) else ""
240
+
241
+ if ws_pat.sub(" ", row_key) == "INDUSTRY NAME":
242
+ read_row_flag = True
243
+ row_keys = [_c.upper() for _c in xl_row]
244
+ continue
245
+ elif not read_row_flag or not row_key or row_key.startswith("TOTAL"):
246
+ continue
247
+ else:
248
+ xl_row[1] = int(xl_row[1])
249
+ margin_data_annual |= MappingProxyType({
250
+ row_key: MappingProxyType(
251
+ dict(zip(row_keys[1:], xl_row[1:], strict=True))
252
+ )
253
+ })
254
+
255
+ damodaran_margin_data = MappingProxyType(margin_data_dict)
256
+ with (
257
+ zipfile.ZipFile(data_archive_path, "w") as _yzp,
258
+ _yzp.open(f"{data_archive_path.stem}.yaml", "w") as _yfh,
259
+ ):
260
+ this_yaml.dump(damodaran_margin_data, _yfh)
261
+
262
+ return damodaran_margin_data
263
+
264
+
265
+ def margin_data_downloader() -> DamodaranMarginData:
266
+ """Download Prof. Damodaran's margin data."""
267
+ _u3pm = urllib3.PoolManager(ca_certs=certifi.where())
268
+ _data_source_url = "https://pages.stern.nyu.edu/~adamodar/pc/datasets/"
269
+ _archive_source_url = "https://pages.stern.nyu.edu/~adamodar/pc/archives/"
270
+
271
+ dest_dir = WORK_DIR / "damodaran_margin_data_archive"
272
+
273
+ # Get current-year margin data
274
+ workbook_name = "margin.xls"
275
+ workbook_path = dest_dir / workbook_name
276
+ if workbook_path.is_file():
277
+ workbook_path.unlink()
278
+
279
+ u3pm = urllib3.PoolManager(ca_certs=certifi.where())
280
+ download_file(u3pm, f"{_data_source_url}{workbook_name}", workbook_path)
281
+
282
+ # Get archived margin data
283
+ workbook_re = re.compile(r"margin(\d{2}).xls")
284
+ archive_html = _u3pm.request("GET", _archive_source_url).data.decode("utf-8")
285
+ archive_tree = BeautifulSoup(archive_html, "lxml")
286
+ for tag in archive_tree.find_all("a"):
287
+ if (
288
+ (_r := workbook_re.fullmatch(_w := tag.get("href", "")))
289
+ and int(_r[1]) > 16
290
+ and int(_r[1]) not in {98, 99}
291
+ ):
292
+ _url, _path = f"{_archive_source_url}{_w}", dest_dir / _w
293
+ if _path.is_file():
294
+ _path.unlink()
295
+
296
+ download_file(_u3pm, _url, _path)
297
+
298
+
299
+ def download_file(_u3pm: urllib3.PoolManager, _url: str, _path: Path) -> None:
300
+ """Download a binary file from URL to filesystem path."""
301
+ chunk_size_ = 1024 * 1024
302
+ with (
303
+ _u3pm.request("GET", _url, preload_content=False) as _uh,
304
+ _path.open("wb") as _fh,
305
+ ):
306
+ while True:
307
+ data_ = _uh.read(chunk_size_)
308
+ if not data_:
309
+ break
310
+ _fh.write(data_)
311
+ os.utime(
312
+ _path,
313
+ times=(
314
+ (
315
+ _t := datetime.datetime.strptime(
316
+ _uh.headers["Last-Modified"], "%a, %d %b %Y %H:%M:%S %Z"
317
+ )
318
+ .astimezone(datetime.UTC)
319
+ .timestamp()
320
+ ),
321
+ _t,
322
+ ),
323
+ )
324
+
325
+ print(f"Downloaded {_url} to {_path}.")