mergeron-2025.739319.3-py3-none-any.whl → mergeron-2025.739341.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

mergeron/__init__.py CHANGED
@@ -1,3 +1,5 @@
+"""Variables, types, objects and functions used throughout the package."""
+
 from __future__ import annotations

 import enum
@@ -6,13 +8,14 @@ from multiprocessing import cpu_count
 from pathlib import Path
 from typing import Any, Literal

+import attrs
 import numpy as np
 from numpy.typing import NDArray
 from ruamel import yaml

-_PKG_NAME: str = Path(__file__).parent.stem
+_PKG_NAME: str = Path(__file__).parent.name

-VERSION = "2025.739319.3"
+VERSION = "2025.739341.9"

 __version__ = VERSION

@@ -34,7 +37,7 @@ EMPTY_ARRAYINT = np.array([], int)

 NTHREADS = 2 * cpu_count()

-PKG_ATTRS_MAP: dict[str, object] = {}
+PKG_ATTRS_MAP: dict[str, type] = {}

 np.set_printoptions(precision=24, floatmode="fixed")

@@ -75,23 +78,21 @@ this_yaml.indent(mapping=2, sequence=4, offset=2)
 def yaml_rt_mapper(
     _c: yaml.constructor.RoundTripConstructor, _n: yaml.MappingNode
 ) -> Mapping[str, Any]:
-    """
-    Constructs a mapping from a mapping node with the RoundTripConstructor
-
-    """
+    """Construct mapping from a mapping node with the RoundTripConstructor."""
     data_: Mapping[str, Any] = yaml.constructor.CommentedMap()
     _c.construct_mapping(_n, maptyp=data_, deep=True)
     return data_


-def yamelize_attrs(
-    _typ: object, /, *, attr_map: Mapping[str, object] = PKG_ATTRS_MAP
-) -> None:
+def yamelize_attrs(_typ: type, /, *, attr_map: dict[str, type] = PKG_ATTRS_MAP) -> None:
     """Add yaml representer, constructor for attrs-defined class.

-    Applying this function, attributes with property, `init=False` are
-    not serialized to YAML.
+    Attributes with property, `init=False` are not serialized/deserialized
+    to YAML by the functions defined here. These attributes can, of course,
+    be dumped to stand-alone (YAML) representation, and deserialized from there.
     """
+    if not attrs.has(_typ):
+        raise ValueError(f"Object {_typ} is not attrs-defined")

    attr_map |= {_typ.__name__: _typ}

@@ -99,9 +100,6 @@ def yamelize_attrs(
         _typ,
         lambda _r, _d: _r.represent_mapping(
             f"!{_d.__class__.__name__}",
-            # construct mapping, rather than calling attrs.asdict(),
-            # to use yaml representers defined in this package for
-            # "upstream" objects
             {_a.name: getattr(_d, _a.name) for _a in _d.__attrs_attrs__ if _a.init},
         ),
     )
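For context, a minimal sketch of the registration flow above, assuming `yamelize_attrs` and `this_yaml` are imported from `mergeron`; the `Point` class is hypothetical:

import io

import attrs

from mergeron import this_yaml, yamelize_attrs


@attrs.frozen
class Point:  # hypothetical attrs-defined class
    x: float
    y: float


yamelize_attrs(Point)  # registers a "!Point" representer/constructor; non-attrs types now raise ValueError

buf = io.StringIO()
this_yaml.dump(Point(x=1.0, y=2.0), buf)
print(buf.getvalue())  # expected: a "!Point"-tagged mapping of the init-able attributes
print(this_yaml.load(buf.getvalue()))  # expected: Point(x=1.0, y=2.0)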
@@ -113,12 +111,13 @@

 @this_yaml.register_class
 class Enameled(enum.Enum):
-    """Add YAML representer, constructor for enum.Enum"""
+    """Add YAML representer, constructor for enum.Enum."""

     @classmethod
     def to_yaml(
-        cls, _r: yaml.representer.RoundTripRepresenter, _d: object[enum.EnumType]
+        cls, _r: yaml.representer.RoundTripRepresenter, _d: enum.Enum
     ) -> yaml.ScalarNode:
+        """Serialize enumerations by .name, not .value."""
         return _r.represent_scalar(
             f"!{super().__getattribute__(cls, '__name__')}", f"{_d.name}"
         )
@@ -126,8 +125,10 @@ class Enameled(enum.Enum):
     @classmethod
     def from_yaml(
         cls, _c: yaml.constructor.RoundTripConstructor, _n: yaml.ScalarNode
-    ) -> object[enum.EnumType]:
-        return super().__getattribute__(cls, _n.value)
+    ) -> enum.EnumType:
+        """Deserialize enumeration."""
+        retval: enum.EnumType = super().__getattribute__(cls, _n.value)
+        return retval


 @this_yaml.register_class
@@ -181,10 +182,7 @@ class RECForm(str, Enameled):
 @this_yaml.register_class
 @enum.unique
 class UPPAggrSelector(str, Enameled):
-    """
-    Aggregator for GUPPI and diversion ratio estimates.
-
-    """
+    """Aggregator for GUPPI and diversion ratio estimates."""

     AVG = "average"
     CPA = "cross-product-share weighted average"
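For context, a small sketch of the `Enameled` hooks above: members serialize by `.name`, so the YAML scalar carries the member name rather than the value string (assuming `UPPAggrSelector` and `this_yaml` are importable from `mergeron`):

import io

from mergeron import UPPAggrSelector, this_yaml

buf = io.StringIO()
this_yaml.dump(UPPAggrSelector.AVG, buf)
print(buf.getvalue())  # expected: a "!UPPAggrSelector" scalar holding "AVG", not "average"
print(this_yaml.load(buf.getvalue()) is UPPAggrSelector.AVG)  # expected: True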
mergeron/core/__init__.py CHANGED
@@ -1,3 +1,5 @@
+"""Constants, types, objects and functions used within this sub-package."""
+
 from __future__ import annotations

 from collections.abc import Mapping
@@ -8,6 +10,7 @@ from typing import Any
 import mpmath  # type: ignore
 import numpy as np
 from attrs import cmp_using, field, frozen
+from numpy.random import PCG64DXSM

 from .. import (  # noqa: TID252
     VERSION,
@@ -20,13 +23,15 @@ from .. import ( # noqa: TID252
 __version__ = VERSION

+DEFAULT_BITGENERATOR = PCG64DXSM
+
 type MPFloat = mpmath.ctx_mp_python.mpf
-type MPMatrix = mpmath.matrix  # type: ignore
+type MPMatrix = mpmath.matrices.matrices._matrix


 @frozen
 class GuidelinesBoundary:
-    """Output of a Guidelines boundary function."""
+    """Represents Guidelines boundary analytically."""

     coordinates: ArrayDouble
     """Market-share pairs as Cartesian coordinates of points on the boundary."""
@@ -35,8 +40,19 @@ class GuidelinesBoundary:
     """Area under the boundary."""


+@frozen
+class GuidelinesBoundaryCallable:
+    """A function to generate Guidelines boundary points, along with area and knot."""
+
+    boundary_function: Callable[[ArrayDouble], ArrayDouble]
+    area: float
+    s_naught: float = 0
+
+
 @frozen
 class INVTableData:
+    """Represents individual table of FTC merger investigations data."""
+
     industry_group: str
     additional_evidence: str
     data_array: ArrayBIGINT = field(eq=cmp_using(eq=np.array_equal))
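For context, a hypothetical construction of the new `GuidelinesBoundaryCallable`; the linear boundary s_2 = delta - s_1 and its area are illustrative values, not taken from the package:

import numpy as np

from mergeron.core import GuidelinesBoundaryCallable

delta = 0.2  # illustrative share threshold
bdry = GuidelinesBoundaryCallable(
    boundary_function=lambda s_1: delta - s_1,  # s_2 as a function of s_1
    area=delta**2 / 2,  # area under the linear boundary over [0, delta]
)
print(bdry.boundary_function(np.linspace(0.0, delta, 5)), bdry.area, bdry.s_naught)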
@@ -45,7 +61,7 @@
 type INVData = MappingProxyType[
     str, MappingProxyType[str, MappingProxyType[str, INVTableData]]
 ]
-type INVData_in = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
+type INVData_in = dict[str, dict[str, dict[str, INVTableData]]]

 yamelize_attrs(INVTableData)

@@ -91,14 +107,14 @@ _, _ = (

 def _dict_from_mapping(_p: Mapping[Any, Any], /) -> dict[Any, Any]:
     retval: dict[Any, Any] = {}
-    for _k, _v in _p.items():  # for subit in it:
+    for _k, _v in _p.items():
         retval |= {_k: _dict_from_mapping(_v)} if isinstance(_v, Mapping) else {_k: _v}
     return retval


 def _mappingproxy_from_mapping(_p: Mapping[Any, Any], /) -> MappingProxyType[Any, Any]:
     retval: dict[Any, Any] = {}
-    for _k, _v in _p.items():  # for subit in it:
+    for _k, _v in _p.items():
         retval |= (
             {_k: _mappingproxy_from_mapping(_v)}
             if isinstance(_v, Mapping)
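For context, the two recursive helpers above convert nested mappings in opposite directions; a quick sketch (the helpers are module-private, so the import is for illustration only):

from mergeron.core import _dict_from_mapping, _mappingproxy_from_mapping

frozen_view = _mappingproxy_from_mapping({"a": {"b": 1}, "c": 2})
print(type(frozen_view), type(frozen_view["a"]))  # mappingproxy at every level
print(_dict_from_mapping(frozen_view))  # back to plain nested dicts: {'a': {'b': 1}, 'c': 2}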
@@ -1,6 +1,4 @@
-"""
-Functions to parse margin data compiled by
-Prof. Aswath Damodaran, Stern School of Business, NYU.
+"""Functions to parse margin data compiled by Prof. Aswath Damodaran, Stern School of Business, NYU.

 Provides :func:`margin_data_resampler` for generating margin data
 from an estimated Gaussian KDE from the source (margin) data.
@@ -10,9 +8,8 @@ Data are downloaded or reused from a local copy, on demand.
 For terms of use of Prof. Damodaran's data, please see:
 https://pages.stern.nyu.edu/~adamodar/New_Home_Page/datahistory.html

-NOTES
+Notes
 -----
-
 Prof. Damodaran notes that the data construction may not be
 consistent from iteration to iteration. He also notes that,
 "the best use for my data is in real time corporate financial analysis
@@ -36,171 +33,51 @@ price-cost margins fall in the interval :math:`[0, 1]`.

 """

-import shutil
+import datetime
+import os
+import re
 import zipfile
 from pathlib import Path
 from types import MappingProxyType

+import certifi
 import numpy as np
 import urllib3
-from joblib import Parallel, delayed
-from numpy.random import PCG64DXSM, Generator, SeedSequence
+from bs4 import BeautifulSoup
+from joblib import Parallel, delayed  # type: ignore
+from numpy.random import Generator, SeedSequence
+from python_calamine import CalamineWorkbook
 from scipy import stats  # type: ignore
-from xlrd import open_workbook  # type: ignore

 from .. import NTHREADS, VERSION, ArrayDouble, this_yaml  # noqa: TID252
 from .. import WORK_DIR as PKG_WORK_DIR  # noqa: TID252
-from .. import data as mdat  # noqa: TID252
-from . import _mappingproxy_from_mapping
+from . import DEFAULT_BITGENERATOR

 __version__ = VERSION

 WORK_DIR = globals().get("WORK_DIR", PKG_WORK_DIR)
-"""Redefined, in case the user defines WORK_DIR betweeen module imports."""
+"""Redefined, in case the user defines WORK_DIR between module imports."""

 MGNDATA_ARCHIVE_PATH = WORK_DIR / "damodaran_margin_data_serialized.zip"

 type DamodaranMarginData = MappingProxyType[str, MappingProxyType[str, float | int]]

-u3pm = urllib3.PoolManager()
-
-
-def margin_data_getter(  # noqa: PLR0912
-    _table_name: str = "margin",
-    *,
-    data_archive_path: Path = MGNDATA_ARCHIVE_PATH,
-    data_download_flag: bool = False,
-) -> DamodaranMarginData:
-    if _table_name != "margin":  # Not validated for other tables
-        raise ValueError(
-            "This code is designed for parsing Prof. Damodaran's margin tables."
-        )
-
-    data_archive_path = data_archive_path or MGNDATA_ARCHIVE_PATH
-    workbook_path = data_archive_path.parent / f"damodaran_{_table_name}_data.xls"
-    if data_archive_path.is_file() and not data_download_flag:
-        with zipfile.ZipFile(data_archive_path) as _yzip:
-            margin_data_dict = this_yaml.load(
-                _yzip.read(data_archive_path.with_suffix(".yaml").name)
-            )
-        return margin_data_dict
-    elif workbook_path.is_file():
-        workbook_path.unlink()
-    if data_archive_path.is_file():
-        data_archive_path.unlink()
-
-    margin_urlstr = (
-        f"https://pages.stern.nyu.edu/~adamodar/pc/datasets/{_table_name}.xls"
-    )
-    try:
-        chunk_size_ = 1024 * 1024
-        with (
-            u3pm.request(
-                "GET", margin_urlstr, preload_content=False
-            ) as _urlopen_handle,
-            workbook_path.open("wb") as margin_file,
-        ):
-            while True:
-                data_ = _urlopen_handle.read(chunk_size_)
-                if not data_:
-                    break
-                margin_file.write(data_)
-
-        print(f"Downloaded {margin_urlstr} to {workbook_path}.")
-
-    except urllib3.exceptions.MaxRetryError as error_:
-        if isinstance(error_.__cause__, urllib3.exceptions.SSLError):
-            # Works fine with other sites secured with certificates
-            # from the Internet2 CA, such as,
-            # https://snap.stanford.edu/data/web-Stanford.txt.gz
-            print(
-                f"WARNING: Could not establish secure connection to, {margin_urlstr}."
-                "Using bundled copy."
-            )
-            if not workbook_path.is_file():
-                shutil.copy2(mdat.DAMODARAN_MARGIN_WORKBOOK, workbook_path)
-        else:
-            raise error_
-
-    xl_book = open_workbook(workbook_path, ragged_rows=True, on_demand=True)
-    xl_sheet = xl_book.sheet_by_name("Industry Averages")
-
-    margin_dict_in: dict[str, dict[str, float | int]] = {}
-    row_keys: list[str] = []
-    read_row_flag = False
-    for _ridx in range(xl_sheet.nrows):
-        xl_row = xl_sheet.row_values(_ridx)
-        if xl_row[0] == "Industry Name":
-            read_row_flag = True
-            row_keys = xl_row
-            continue
-
-        if not xl_row[0] or not read_row_flag:
-            continue
-
-        xl_row[1] = int(xl_row[1])
-        margin_dict_in[xl_row[0]] = dict(zip(row_keys[1:], xl_row[1:], strict=True))
-
-    margin_dict = _mappingproxy_from_mapping(margin_dict_in)
-    with (
-        zipfile.ZipFile(data_archive_path, "w") as _yzip,
-        _yzip.open(f"{data_archive_path.stem}.yaml", "w") as _yfh,
-    ):
-        this_yaml.dump(margin_dict, _yfh)
-
-    return margin_dict
-
-
-def margin_data_builder(
-    _src_data_dict: DamodaranMarginData | None = None, /
-) -> tuple[ArrayDouble, ArrayDouble]:
-    if _src_data_dict is None:
-        _src_data_dict = margin_data_getter()
-
-    margin_data_wts, margin_data_obs = (
-        _f.flatten()
-        for _f in np.hsplit(
-            np.array([
-                tuple(
-                    _src_data_dict[_g][_h] for _h in ["Number of firms", "Gross Margin"]
-                )
-                for _g in _src_data_dict
-                if not _g.startswith("Total Market")
-                and _g
-                not in {
-                    "Bank (Money Center)",
-                    "Banks (Regional)",
-                    "Brokerage & Investment Banking",
-                    "Financial Svcs. (Non-bank & Insurance)",
-                    "Insurance (General)",
-                    "Insurance (Life)",
-                    "Insurance (Prop/Cas.)",
-                    "Investments & Asset Management",
-                    "R.E.I.T.",
-                    "Retail (REITs)",
-                    "Reinsurance",
-                }
-            ]),
-            2,
-        )
-    )
-
-    margin_wtd_avg = np.average(margin_data_obs, weights=margin_data_wts)
-    # https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf
-    margin_wtd_stderr = np.sqrt(
-        np.average((margin_data_obs - margin_wtd_avg) ** 2, weights=margin_data_wts)
-        * (len(margin_data_wts) / (len(margin_data_wts) - 1))
-    )
-
-    return np.stack([margin_data_obs, margin_data_wts], axis=1, dtype=float), np.round(
-        (
-            margin_wtd_avg,
-            margin_wtd_stderr,
-            margin_data_obs.min(),
-            margin_data_obs.max(),
-        ),
-        8,
+FINANCIAL_INDUSTRIES = {
+    _i.upper()
+    for _i in (
+        "Bank (Money Center)",
+        "Banks (Regional)",
+        "Brokerage & Investment Banking",
+        "Financial Svcs. (Non-bank & Insurance)",
+        "Insurance (General)",
+        "Insurance (Life)",
+        "Insurance (Prop/Cas.)",
+        "Investments & Asset Management",
+        "R.E.I.T.",
+        "Retail (REITs)",
+        "Reinsurance",
     )
+}


 def margin_data_resampler(
@@ -211,8 +88,7 @@
     seed_sequence: SeedSequence | None = None,
     nthreads: int = NTHREADS,
 ) -> ArrayDouble:
-    """
-    Generate draws from the empirical distribution bassed on Prof. Damodaran's margin data.
+    """Generate draws from the empirical distribution based on Prof. Damodaran's margin data.

     The empirical distribution is estimated using a Gaussian KDE; the bandwidth
     selected using Silverman's rule is narrowed to reflect that the margin data
@@ -222,7 +98,6 @@

     Parameters
     ----------
-
     _dist_parms
         Array of margins and firm counts extracted from Prof. Damodaran's margin data

@@ -235,12 +110,12 @@

     nthreads
         Number of threads to use in generating margin data.
+
     Returns
     -------
     Array of margin values

     """
-
     _dist_parms = margin_data_builder()[0] if _dist_parms is None else _dist_parms

     _seed = seed_sequence or SeedSequence(pool_size=8)
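For context, a self-contained sketch of the technique described in this docstring: a weighted Gaussian KDE whose Silverman bandwidth is narrowed by a factor of 3 before resampling through NumPy's PCG64DXSM bit generator (the data values here are made up):

import numpy as np
from numpy.random import PCG64DXSM, Generator, SeedSequence
from scipy import stats

# Made-up margins and firm-count weights standing in for the Damodaran data
margins = np.array([0.25, 0.30, 0.42, 0.55, 0.60])
weights = np.array([12.0, 40.0, 25.0, 8.0, 15.0])

kde = stats.gaussian_kde(margins, weights=weights, bw_method="silverman")
kde.set_bandwidth(bw_method=kde.factor / 3.0)  # narrower bandwidth preserves distinct modes

draws = kde.resample(1000, seed=Generator(PCG64DXSM(SeedSequence(pool_size=8)))).T
print(draws.shape)  # (1000, 1)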
@@ -248,23 +123,204 @@
     _x, _w = _dist_parms[:, 0], _dist_parms[:, 1]

     margin_kde = stats.gaussian_kde(_x, weights=_w, bw_method="silverman")
+    # preserve multiplicity of modes:
     margin_kde.set_bandwidth(bw_method=margin_kde.factor / 3.0)

     if isinstance(sample_size, int):
-        return margin_kde.resample(sample_size, seed=Generator(PCG64DXSM(_seed))).T
+        ret_array: ArrayDouble = margin_kde.resample(
+            sample_size, seed=Generator(DEFAULT_BITGENERATOR(_seed))
+        ).T

     elif isinstance(sample_size, tuple) and len(sample_size) == 2:
         ret_array = np.empty(sample_size, float)

         _ssz, _ncol = sample_size
-        dat_list = Parallel(n_jobs=min(nthreads, _ncol), prefer="threads")(
-            delayed(margin_kde.resample)(_ssz, seed=Generator(PCG64DXSM(_col_seed)))
+        _threads = min(nthreads, _ncol)
+        dat_list = Parallel(n_jobs=_threads, prefer="threads")(
+            delayed(margin_kde.resample)(
+                _ssz, seed=Generator(DEFAULT_BITGENERATOR(_col_seed))
+            )
             for _col_seed in _seed.spawn(_ncol)
         )

         for _i in range(_ncol):
-            ret_array[:, [_i]] = dat_list[_i].T
-
-        return ret_array
+            ret_array[:, _i] = dat_list[_i][0]
     else:
         raise ValueError(f"Invalid sample size: {sample_size!r}")
+
+    return ret_array
+
+
+def margin_data_builder(
+    _margin_data_dict: DamodaranMarginData | None = None,
+) -> tuple[ArrayDouble, ArrayDouble]:
+    """Derive average firm-counts and gross-margins by industry from source data."""
+    _margin_data_dict = (
+        margin_data_getter() if _margin_data_dict is None else _margin_data_dict
+    )
+    dmd_keys = set()
+    for _k, _v in _margin_data_dict.items():
+        dmd_keys.update(set(_v.keys()))
+    dmd_keys = sorted(dmd_keys)
+
+    dist_parms = np.array([np.nan, np.nan], dtype=float)
+    for _sk in dmd_keys:
+        if _sk in FINANCIAL_INDUSTRIES or _sk.startswith("TOTAL"):
+            continue
+
+        _missing = {"GROSS MARGIN": 0.0, "NUMBER OF FIRMS": 0.0}
+        gm, fc = zip(*[
+            [_v.get(_sk, _missing).get(_f) for _f in _missing]
+            for _k, _v in _margin_data_dict.items()
+        ])
+
+        average_margin, firm_count = np.array(gm, float), np.array(fc, int)
+        # print(firm_count, average_margin)
+        dist_parms = np.vstack((
+            dist_parms,
+            np.array((
+                np.average(
+                    average_margin, weights=(average_margin > 0) * (firm_count > 0)
+                ),
+                np.average(firm_count, weights=(average_margin > 0) * (firm_count > 0)),
+            )),
+        ))
+
+    dist_parms = dist_parms[1:, :]
+
+    obs_, wts_ = (dist_parms[:, _f] for _f in range(2))
+
+    avg_gm, num_firms = np.average(obs_, weights=wts_, returned=True)
+    std_gm = np.sqrt(
+        np.average((obs_ - avg_gm) ** 2, weights=wts_)
+        * num_firms
+        * len(obs_)
+        / ((num_firms - len(obs_)) * (len(obs_) - 1))
+    )
+
+    return dist_parms, np.array([avg_gm, std_gm, obs_.min(), obs_.max()], float)
+
+
+def margin_data_getter(
+    *, data_archive_path: Path | None = None, data_download_flag: bool = False
+) -> DamodaranMarginData:
+    """Download and parse Prof. Damodaran's margin data."""
+    data_archive_path = (
+        MGNDATA_ARCHIVE_PATH if data_archive_path is None else data_archive_path
+    )
+    if data_archive_path.is_file() and not data_download_flag:
+        with zipfile.ZipFile(data_archive_path) as _yzp:
+            margin_data_dict: DamodaranMarginData = this_yaml.load(
+                _yzp.read(data_archive_path.with_suffix(".yaml").name)
+            )
+        return margin_data_dict
+
+    # Get workbooks from source
+    elif data_download_flag or not list(data_archive_path.glob("margin*.xls")):
+        margin_data_downloader()
+
+    # Whitespace cleanup
+    ws_pat = re.compile(r"\s+")
+
+    # Parse workbooks and save margin data dictionary
+    margin_data_dict = {}
+    for _p in (WORK_DIR / "damodaran_margin_data_archive").iterdir():
+        xl_wbk = CalamineWorkbook.from_path(_p)
+        xl_wks = xl_wbk.get_sheet_by_index(
+            0 if (_p.stem != "margin" and int(_p.stem[-2:]) in {17, 18, 19}) else 1
+        ).to_python()
+        if xl_wks[8][2] != "Gross Margin":
+            raise ValueError("Worksheet does not match expected layout.")
+
+        update = xl_wks[0][1].isoformat()[:10]
+        margin_data_annual = margin_data_dict.setdefault(update, {})
+        row_keys: list[str] = []
+        read_row_flag = False
+        for xl_row in xl_wks:
+            row_key = _s.upper() if isinstance((_s := xl_row[0]), str) else ""
+
+            if ws_pat.sub(" ", row_key) == "INDUSTRY NAME":
+                read_row_flag = True
+                row_keys = [_c.upper() for _c in xl_row]
+                continue
+            elif not read_row_flag or not row_key or row_key.startswith("TOTAL"):
+                continue
+            else:
+                xl_row[1] = int(xl_row[1])
+                margin_data_annual |= MappingProxyType({
+                    row_key: MappingProxyType(
+                        dict(zip(row_keys[1:], xl_row[1:], strict=True))
+                    )
+                })
+
+    damodaran_margin_data = MappingProxyType(margin_data_dict)
+    with (
+        zipfile.ZipFile(data_archive_path, "w") as _yzp,
+        _yzp.open(f"{data_archive_path.stem}.yaml", "w") as _yfh,
+    ):
+        this_yaml.dump(damodaran_margin_data, _yfh)
+
+    return damodaran_margin_data
+
+
+def margin_data_downloader() -> DamodaranMarginData:
+    """Download Prof. Damodaran's margin data."""
+    _u3pm = urllib3.PoolManager(ca_certs=certifi.where())
+    _data_source_url = "https://pages.stern.nyu.edu/~adamodar/pc/datasets/"
+    _archive_source_url = "https://pages.stern.nyu.edu/~adamodar/pc/archives/"
+
+    dest_dir = WORK_DIR / "damodaran_margin_data_archive"
+
+    # Get current-year margin data
+    workbook_name = "margin.xls"
+    workbook_path = dest_dir / workbook_name
+    if workbook_path.is_file():
+        workbook_path.unlink()
+
+    u3pm = urllib3.PoolManager(ca_certs=certifi.where())
+    download_file(u3pm, f"{_data_source_url}{workbook_name}", workbook_path)
+
+    # Get archived margin data
+    workbook_re = re.compile(r"margin(\d{2}).xls")
+    archive_html = _u3pm.request("GET", _archive_source_url).data.decode("utf-8")
+    archive_tree = BeautifulSoup(archive_html, "lxml")
+    for tag in archive_tree.find_all("a"):
+        if (
+            (_r := workbook_re.fullmatch(_w := tag.get("href", "")))
+            and int(_r[1]) > 16
+            and int(_r[1]) not in {98, 99}
+        ):
+            _url, _path = f"{_archive_source_url}{_w}", dest_dir / _w
+            if _path.is_file():
+                _path.unlink()
+
+            download_file(_u3pm, _url, _path)
+
+
+def download_file(_u3pm: urllib3.PoolManager, _url: str, _path: Path) -> None:
+    """Download a binary file from URL to filesystem path."""
+    chunk_size_ = 1024 * 1024
+    with (
+        _u3pm.request("GET", _url, preload_content=False) as _uh,
+        _path.open("wb") as _fh,
+    ):
+        while True:
+            data_ = _uh.read(chunk_size_)
+            if not data_:
+                break
+            _fh.write(data_)
+    os.utime(
+        _path,
+        times=(
+            (
+                _t := datetime.datetime.strptime(
+                    _uh.headers["Last-Modified"], "%a, %d %b %Y %H:%M:%S %Z"
+                )
+                .astimezone(datetime.UTC)
+                .timestamp()
+            ),
+            _t,
+        ),
+    )
+
+    print(f"Downloaded {_url} to {_path}.")