mergeron 2025.739319.3__py3-none-any.whl → 2025.739341.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mergeron might be problematic. Click here for more details.
- mergeron/__init__.py +21 -23
- mergeron/core/__init__.py +21 -5
- mergeron/core/empirical_margin_distribution.py +216 -160
- mergeron/core/ftc_merger_investigations_data.py +31 -35
- mergeron/core/guidelines_boundaries.py +27 -20
- mergeron/core/guidelines_boundary_functions.py +22 -32
- mergeron/core/guidelines_boundary_functions_extra.py +15 -30
- mergeron/core/pseudorandom_numbers.py +21 -18
- mergeron/data/__init__.py +13 -11
- mergeron/data/damodaran_margin_data_serialized.zip +0 -0
- mergeron/gen/__init__.py +32 -41
- mergeron/gen/data_generation.py +19 -23
- mergeron/gen/data_generation_functions.py +27 -38
- mergeron/gen/enforcement_stats.py +154 -32
- mergeron/gen/upp_tests.py +4 -9
- mergeron-2025.739341.9.dist-info/METADATA +94 -0
- mergeron-2025.739341.9.dist-info/RECORD +20 -0
- {mergeron-2025.739319.3.dist-info → mergeron-2025.739341.9.dist-info}/WHEEL +1 -1
- mergeron/data/damodaran_margin_data.xls +0 -0
- mergeron/demo/__init__.py +0 -3
- mergeron/demo/visualize_empirical_margin_distribution.py +0 -94
- mergeron-2025.739319.3.dist-info/METADATA +0 -174
- mergeron-2025.739319.3.dist-info/RECORD +0 -22
mergeron/__init__.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Variables, types, objects and functions used throughout the package."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
5
|
import enum
|
|
@@ -6,13 +8,14 @@ from multiprocessing import cpu_count
|
|
|
6
8
|
from pathlib import Path
|
|
7
9
|
from typing import Any, Literal
|
|
8
10
|
|
|
11
|
+
import attrs
|
|
9
12
|
import numpy as np
|
|
10
13
|
from numpy.typing import NDArray
|
|
11
14
|
from ruamel import yaml
|
|
12
15
|
|
|
13
|
-
_PKG_NAME: str = Path(__file__).parent.
|
|
16
|
+
_PKG_NAME: str = Path(__file__).parent.name
|
|
14
17
|
|
|
15
|
-
VERSION = "2025.
|
|
18
|
+
VERSION = "2025.739341.9"
|
|
16
19
|
|
|
17
20
|
__version__ = VERSION
|
|
18
21
|
|
|
@@ -34,7 +37,7 @@ EMPTY_ARRAYINT = np.array([], int)
|
|
|
34
37
|
|
|
35
38
|
NTHREADS = 2 * cpu_count()
|
|
36
39
|
|
|
37
|
-
PKG_ATTRS_MAP: dict[str,
|
|
40
|
+
PKG_ATTRS_MAP: dict[str, type] = {}
|
|
38
41
|
|
|
39
42
|
np.set_printoptions(precision=24, floatmode="fixed")
|
|
40
43
|
|
|
@@ -75,23 +78,21 @@ this_yaml.indent(mapping=2, sequence=4, offset=2)
|
|
|
75
78
|
def yaml_rt_mapper(
|
|
76
79
|
_c: yaml.constructor.RoundTripConstructor, _n: yaml.MappingNode
|
|
77
80
|
) -> Mapping[str, Any]:
|
|
78
|
-
"""
|
|
79
|
-
Constructs a mapping from a mapping node with the RoundTripConstructor
|
|
80
|
-
|
|
81
|
-
"""
|
|
81
|
+
"""Construct mapping from a mapping node with the RoundTripConstructor."""
|
|
82
82
|
data_: Mapping[str, Any] = yaml.constructor.CommentedMap()
|
|
83
83
|
_c.construct_mapping(_n, maptyp=data_, deep=True)
|
|
84
84
|
return data_
|
|
85
85
|
|
|
86
86
|
|
|
87
|
-
def yamelize_attrs(
|
|
88
|
-
_typ: object, /, *, attr_map: Mapping[str, object] = PKG_ATTRS_MAP
|
|
89
|
-
) -> None:
|
|
87
|
+
def yamelize_attrs(_typ: type, /, *, attr_map: dict[str, type] = PKG_ATTRS_MAP) -> None:
|
|
90
88
|
"""Add yaml representer, constructor for attrs-defined class.
|
|
91
89
|
|
|
92
|
-
|
|
93
|
-
|
|
90
|
+
Attributes with property, `init=False` are not serialized/deserialized
|
|
91
|
+
to YAML by the functions defined here. These attributes can, of course,
|
|
92
|
+
be dumped to stand-alone (YAML) representation, and deserialized from there.
|
|
94
93
|
"""
|
|
94
|
+
if not attrs.has(_typ):
|
|
95
|
+
raise ValueError(f"Object {_typ} is not attrs-defined")
|
|
95
96
|
|
|
96
97
|
attr_map |= {_typ.__name__: _typ}
|
|
97
98
|
|
|
@@ -99,9 +100,6 @@ def yamelize_attrs(
|
|
|
99
100
|
_typ,
|
|
100
101
|
lambda _r, _d: _r.represent_mapping(
|
|
101
102
|
f"!{_d.__class__.__name__}",
|
|
102
|
-
# construct mapping, rather than calling attrs.asdict(),
|
|
103
|
-
# to use yaml representers defined in this package for
|
|
104
|
-
# "upstream" objects
|
|
105
103
|
{_a.name: getattr(_d, _a.name) for _a in _d.__attrs_attrs__ if _a.init},
|
|
106
104
|
),
|
|
107
105
|
)
|
|
@@ -113,12 +111,13 @@ def yamelize_attrs(
|
|
|
113
111
|
|
|
114
112
|
@this_yaml.register_class
|
|
115
113
|
class Enameled(enum.Enum):
|
|
116
|
-
"""Add YAML representer, constructor for enum.Enum"""
|
|
114
|
+
"""Add YAML representer, constructor for enum.Enum."""
|
|
117
115
|
|
|
118
116
|
@classmethod
|
|
119
117
|
def to_yaml(
|
|
120
|
-
cls, _r: yaml.representer.RoundTripRepresenter, _d:
|
|
118
|
+
cls, _r: yaml.representer.RoundTripRepresenter, _d: enum.Enum
|
|
121
119
|
) -> yaml.ScalarNode:
|
|
120
|
+
"""Serialize enumerations by .name, not .value."""
|
|
122
121
|
return _r.represent_scalar(
|
|
123
122
|
f"!{super().__getattribute__(cls, '__name__')}", f"{_d.name}"
|
|
124
123
|
)
|
|
@@ -126,8 +125,10 @@ class Enameled(enum.Enum):
|
|
|
126
125
|
@classmethod
|
|
127
126
|
def from_yaml(
|
|
128
127
|
cls, _c: yaml.constructor.RoundTripConstructor, _n: yaml.ScalarNode
|
|
129
|
-
) ->
|
|
130
|
-
|
|
128
|
+
) -> enum.EnumType:
|
|
129
|
+
"""Deserialize enumeration."""
|
|
130
|
+
retval: enum.EnumType = super().__getattribute__(cls, _n.value)
|
|
131
|
+
return retval
|
|
131
132
|
|
|
132
133
|
|
|
133
134
|
@this_yaml.register_class
|
|
@@ -181,10 +182,7 @@ class RECForm(str, Enameled):
|
|
|
181
182
|
@this_yaml.register_class
|
|
182
183
|
@enum.unique
|
|
183
184
|
class UPPAggrSelector(str, Enameled):
|
|
184
|
-
"""
|
|
185
|
-
Aggregator for GUPPI and diversion ratio estimates.
|
|
186
|
-
|
|
187
|
-
"""
|
|
185
|
+
"""Aggregator for GUPPI and diversion ratio estimates."""
|
|
188
186
|
|
|
189
187
|
AVG = "average"
|
|
190
188
|
CPA = "cross-product-share weighted average"
|
mergeron/core/__init__.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Constants, types, objects and functions used within this sub-package."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
5
|
from collections.abc import Mapping
|
|
@@ -8,6 +10,7 @@ from typing import Any
|
|
|
8
10
|
import mpmath # type: ignore
|
|
9
11
|
import numpy as np
|
|
10
12
|
from attrs import cmp_using, field, frozen
|
|
13
|
+
from numpy.random import PCG64DXSM
|
|
11
14
|
|
|
12
15
|
from .. import ( # noqa: TID252
|
|
13
16
|
VERSION,
|
|
@@ -20,13 +23,15 @@ from .. import ( # noqa: TID252
|
|
|
20
23
|
|
|
21
24
|
__version__ = VERSION
|
|
22
25
|
|
|
26
|
+
DEFAULT_BITGENERATOR = PCG64DXSM
|
|
27
|
+
|
|
23
28
|
type MPFloat = mpmath.ctx_mp_python.mpf
|
|
24
|
-
type MPMatrix = mpmath.
|
|
29
|
+
type MPMatrix = mpmath.matrices.matrices._matrix
|
|
25
30
|
|
|
26
31
|
|
|
27
32
|
@frozen
|
|
28
33
|
class GuidelinesBoundary:
|
|
29
|
-
"""
|
|
34
|
+
"""Represents Guidelines boundary analytically."""
|
|
30
35
|
|
|
31
36
|
coordinates: ArrayDouble
|
|
32
37
|
"""Market-share pairs as Cartesian coordinates of points on the boundary."""
|
|
@@ -35,8 +40,19 @@ class GuidelinesBoundary:
|
|
|
35
40
|
"""Area under the boundary."""
|
|
36
41
|
|
|
37
42
|
|
|
43
|
+
@frozen
|
|
44
|
+
class GuidelinesBoundaryCallable:
|
|
45
|
+
"""A function to generate Guidelines boundary points, along with area and knot."""
|
|
46
|
+
|
|
47
|
+
boundary_function: Callable[[ArrayDouble], ArrayDouble]
|
|
48
|
+
area: float
|
|
49
|
+
s_naught: float = 0
|
|
50
|
+
|
|
51
|
+
|
|
38
52
|
@frozen
|
|
39
53
|
class INVTableData:
|
|
54
|
+
"""Represents individual table of FTC merger investigations data."""
|
|
55
|
+
|
|
40
56
|
industry_group: str
|
|
41
57
|
additional_evidence: str
|
|
42
58
|
data_array: ArrayBIGINT = field(eq=cmp_using(eq=np.array_equal))
|
|
@@ -45,7 +61,7 @@ class INVTableData:
|
|
|
45
61
|
type INVData = MappingProxyType[
|
|
46
62
|
str, MappingProxyType[str, MappingProxyType[str, INVTableData]]
|
|
47
63
|
]
|
|
48
|
-
type INVData_in =
|
|
64
|
+
type INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
|
|
49
65
|
|
|
50
66
|
yamelize_attrs(INVTableData)
|
|
51
67
|
|
|
@@ -91,14 +107,14 @@ _, _ = (
|
|
|
91
107
|
|
|
92
108
|
def _dict_from_mapping(_p: Mapping[Any, Any], /) -> dict[Any, Any]:
|
|
93
109
|
retval: dict[Any, Any] = {}
|
|
94
|
-
for _k, _v in _p.items():
|
|
110
|
+
for _k, _v in _p.items():
|
|
95
111
|
retval |= {_k: _dict_from_mapping(_v)} if isinstance(_v, Mapping) else {_k: _v}
|
|
96
112
|
return retval
|
|
97
113
|
|
|
98
114
|
|
|
99
115
|
def _mappingproxy_from_mapping(_p: Mapping[Any, Any], /) -> MappingProxyType[Any, Any]:
|
|
100
116
|
retval: dict[Any, Any] = {}
|
|
101
|
-
for _k, _v in _p.items():
|
|
117
|
+
for _k, _v in _p.items():
|
|
102
118
|
retval |= (
|
|
103
119
|
{_k: _mappingproxy_from_mapping(_v)}
|
|
104
120
|
if isinstance(_v, Mapping)
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Functions to parse margin data compiled by
|
|
3
|
-
Prof. Aswath Damodaran, Stern School of Business, NYU.
|
|
1
|
+
"""Functions to parse margin data compiled by Prof. Aswath Damodaran, Stern School of Business, NYU.
|
|
4
2
|
|
|
5
3
|
Provides :func:`margin_data_resampler` for generating margin data
|
|
6
4
|
from an estimated Gaussian KDE from the source (margin) data.
|
|
@@ -10,9 +8,8 @@ Data are downloaded or reused from a local copy, on demand.
|
|
|
10
8
|
For terms of use of Prof. Damodaran's data, please see:
|
|
11
9
|
https://pages.stern.nyu.edu/~adamodar/New_Home_Page/datahistory.html
|
|
12
10
|
|
|
13
|
-
|
|
11
|
+
Notes
|
|
14
12
|
-----
|
|
15
|
-
|
|
16
13
|
Prof. Damodaran notes that the data construction may not be
|
|
17
14
|
consistent from iteration to iteration. He also notes that,
|
|
18
15
|
"the best use for my data is in real time corporate financial analysis
|
|
@@ -36,171 +33,51 @@ price-cost margins fall in the interval :math:`[0, 1]`.
|
|
|
36
33
|
|
|
37
34
|
"""
|
|
38
35
|
|
|
39
|
-
import
|
|
36
|
+
import datetime
|
|
37
|
+
import os
|
|
38
|
+
import re
|
|
40
39
|
import zipfile
|
|
41
40
|
from pathlib import Path
|
|
42
41
|
from types import MappingProxyType
|
|
43
42
|
|
|
43
|
+
import certifi
|
|
44
44
|
import numpy as np
|
|
45
45
|
import urllib3
|
|
46
|
-
from
|
|
47
|
-
from
|
|
46
|
+
from bs4 import BeautifulSoup
|
|
47
|
+
from joblib import Parallel, delayed # type: ignore
|
|
48
|
+
from numpy.random import Generator, SeedSequence
|
|
49
|
+
from python_calamine import CalamineWorkbook
|
|
48
50
|
from scipy import stats # type: ignore
|
|
49
|
-
from xlrd import open_workbook # type: ignore
|
|
50
51
|
|
|
51
52
|
from .. import NTHREADS, VERSION, ArrayDouble, this_yaml # noqa: TID252
|
|
52
53
|
from .. import WORK_DIR as PKG_WORK_DIR # noqa: TID252
|
|
53
|
-
from
|
|
54
|
-
from . import _mappingproxy_from_mapping
|
|
54
|
+
from . import DEFAULT_BITGENERATOR
|
|
55
55
|
|
|
56
56
|
__version__ = VERSION
|
|
57
57
|
|
|
58
58
|
WORK_DIR = globals().get("WORK_DIR", PKG_WORK_DIR)
|
|
59
|
-
"""Redefined, in case the user defines WORK_DIR
|
|
59
|
+
"""Redefined, in case the user defines WORK_DIR between module imports."""
|
|
60
60
|
|
|
61
61
|
MGNDATA_ARCHIVE_PATH = WORK_DIR / "damodaran_margin_data_serialized.zip"
|
|
62
62
|
|
|
63
63
|
type DamodaranMarginData = MappingProxyType[str, MappingProxyType[str, float | int]]
|
|
64
64
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
data_archive_path = data_archive_path or MGNDATA_ARCHIVE_PATH
|
|
80
|
-
workbook_path = data_archive_path.parent / f"damodaran_{_table_name}_data.xls"
|
|
81
|
-
if data_archive_path.is_file() and not data_download_flag:
|
|
82
|
-
with zipfile.ZipFile(data_archive_path) as _yzip:
|
|
83
|
-
margin_data_dict = this_yaml.load(
|
|
84
|
-
_yzip.read(data_archive_path.with_suffix(".yaml").name)
|
|
85
|
-
)
|
|
86
|
-
return margin_data_dict
|
|
87
|
-
elif workbook_path.is_file():
|
|
88
|
-
workbook_path.unlink()
|
|
89
|
-
if data_archive_path.is_file():
|
|
90
|
-
data_archive_path.unlink()
|
|
91
|
-
|
|
92
|
-
margin_urlstr = (
|
|
93
|
-
f"https://pages.stern.nyu.edu/~adamodar/pc/datasets/{_table_name}.xls"
|
|
94
|
-
)
|
|
95
|
-
try:
|
|
96
|
-
chunk_size_ = 1024 * 1024
|
|
97
|
-
with (
|
|
98
|
-
u3pm.request(
|
|
99
|
-
"GET", margin_urlstr, preload_content=False
|
|
100
|
-
) as _urlopen_handle,
|
|
101
|
-
workbook_path.open("wb") as margin_file,
|
|
102
|
-
):
|
|
103
|
-
while True:
|
|
104
|
-
data_ = _urlopen_handle.read(chunk_size_)
|
|
105
|
-
if not data_:
|
|
106
|
-
break
|
|
107
|
-
margin_file.write(data_)
|
|
108
|
-
|
|
109
|
-
print(f"Downloaded {margin_urlstr} to {workbook_path}.")
|
|
110
|
-
|
|
111
|
-
except urllib3.exceptions.MaxRetryError as error_:
|
|
112
|
-
if isinstance(error_.__cause__, urllib3.exceptions.SSLError):
|
|
113
|
-
# Works fine with other sites secured with certificates
|
|
114
|
-
# from the Internet2 CA, such as,
|
|
115
|
-
# https://snap.stanford.edu/data/web-Stanford.txt.gz
|
|
116
|
-
print(
|
|
117
|
-
f"WARNING: Could not establish secure connection to, {margin_urlstr}."
|
|
118
|
-
"Using bundled copy."
|
|
119
|
-
)
|
|
120
|
-
if not workbook_path.is_file():
|
|
121
|
-
shutil.copy2(mdat.DAMODARAN_MARGIN_WORKBOOK, workbook_path)
|
|
122
|
-
else:
|
|
123
|
-
raise error_
|
|
124
|
-
|
|
125
|
-
xl_book = open_workbook(workbook_path, ragged_rows=True, on_demand=True)
|
|
126
|
-
xl_sheet = xl_book.sheet_by_name("Industry Averages")
|
|
127
|
-
|
|
128
|
-
margin_dict_in: dict[str, dict[str, float | int]] = {}
|
|
129
|
-
row_keys: list[str] = []
|
|
130
|
-
read_row_flag = False
|
|
131
|
-
for _ridx in range(xl_sheet.nrows):
|
|
132
|
-
xl_row = xl_sheet.row_values(_ridx)
|
|
133
|
-
if xl_row[0] == "Industry Name":
|
|
134
|
-
read_row_flag = True
|
|
135
|
-
row_keys = xl_row
|
|
136
|
-
continue
|
|
137
|
-
|
|
138
|
-
if not xl_row[0] or not read_row_flag:
|
|
139
|
-
continue
|
|
140
|
-
|
|
141
|
-
xl_row[1] = int(xl_row[1])
|
|
142
|
-
margin_dict_in[xl_row[0]] = dict(zip(row_keys[1:], xl_row[1:], strict=True))
|
|
143
|
-
|
|
144
|
-
margin_dict = _mappingproxy_from_mapping(margin_dict_in)
|
|
145
|
-
with (
|
|
146
|
-
zipfile.ZipFile(data_archive_path, "w") as _yzip,
|
|
147
|
-
_yzip.open(f"{data_archive_path.stem}.yaml", "w") as _yfh,
|
|
148
|
-
):
|
|
149
|
-
this_yaml.dump(margin_dict, _yfh)
|
|
150
|
-
|
|
151
|
-
return margin_dict
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def margin_data_builder(
|
|
155
|
-
_src_data_dict: DamodaranMarginData | None = None, /
|
|
156
|
-
) -> tuple[ArrayDouble, ArrayDouble]:
|
|
157
|
-
if _src_data_dict is None:
|
|
158
|
-
_src_data_dict = margin_data_getter()
|
|
159
|
-
|
|
160
|
-
margin_data_wts, margin_data_obs = (
|
|
161
|
-
_f.flatten()
|
|
162
|
-
for _f in np.hsplit(
|
|
163
|
-
np.array([
|
|
164
|
-
tuple(
|
|
165
|
-
_src_data_dict[_g][_h] for _h in ["Number of firms", "Gross Margin"]
|
|
166
|
-
)
|
|
167
|
-
for _g in _src_data_dict
|
|
168
|
-
if not _g.startswith("Total Market")
|
|
169
|
-
and _g
|
|
170
|
-
not in {
|
|
171
|
-
"Bank (Money Center)",
|
|
172
|
-
"Banks (Regional)",
|
|
173
|
-
"Brokerage & Investment Banking",
|
|
174
|
-
"Financial Svcs. (Non-bank & Insurance)",
|
|
175
|
-
"Insurance (General)",
|
|
176
|
-
"Insurance (Life)",
|
|
177
|
-
"Insurance (Prop/Cas.)",
|
|
178
|
-
"Investments & Asset Management",
|
|
179
|
-
"R.E.I.T.",
|
|
180
|
-
"Retail (REITs)",
|
|
181
|
-
"Reinsurance",
|
|
182
|
-
}
|
|
183
|
-
]),
|
|
184
|
-
2,
|
|
185
|
-
)
|
|
186
|
-
)
|
|
187
|
-
|
|
188
|
-
margin_wtd_avg = np.average(margin_data_obs, weights=margin_data_wts)
|
|
189
|
-
# https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf
|
|
190
|
-
margin_wtd_stderr = np.sqrt(
|
|
191
|
-
np.average((margin_data_obs - margin_wtd_avg) ** 2, weights=margin_data_wts)
|
|
192
|
-
* (len(margin_data_wts) / (len(margin_data_wts) - 1))
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
return np.stack([margin_data_obs, margin_data_wts], axis=1, dtype=float), np.round(
|
|
196
|
-
(
|
|
197
|
-
margin_wtd_avg,
|
|
198
|
-
margin_wtd_stderr,
|
|
199
|
-
margin_data_obs.min(),
|
|
200
|
-
margin_data_obs.max(),
|
|
201
|
-
),
|
|
202
|
-
8,
|
|
65
|
+
FINANCIAL_INDUSTRIES = {
|
|
66
|
+
_i.upper()
|
|
67
|
+
for _i in (
|
|
68
|
+
"Bank (Money Center)",
|
|
69
|
+
"Banks (Regional)",
|
|
70
|
+
"Brokerage & Investment Banking",
|
|
71
|
+
"Financial Svcs. (Non-bank & Insurance)",
|
|
72
|
+
"Insurance (General)",
|
|
73
|
+
"Insurance (Life)",
|
|
74
|
+
"Insurance (Prop/Cas.)",
|
|
75
|
+
"Investments & Asset Management",
|
|
76
|
+
"R.E.I.T.",
|
|
77
|
+
"Retail (REITs)",
|
|
78
|
+
"Reinsurance",
|
|
203
79
|
)
|
|
80
|
+
}
|
|
204
81
|
|
|
205
82
|
|
|
206
83
|
def margin_data_resampler(
|
|
@@ -211,8 +88,7 @@ def margin_data_resampler(
|
|
|
211
88
|
seed_sequence: SeedSequence | None = None,
|
|
212
89
|
nthreads: int = NTHREADS,
|
|
213
90
|
) -> ArrayDouble:
|
|
214
|
-
"""
|
|
215
|
-
Generate draws from the empirical distribution bassed on Prof. Damodaran's margin data.
|
|
91
|
+
"""Generate draws from the empirical distribution based on Prof. Damodaran's margin data.
|
|
216
92
|
|
|
217
93
|
The empirical distribution is estimated using a Gaussian KDE; the bandwidth
|
|
218
94
|
selected using Silverman's rule is narrowed to reflect that the margin data
|
|
@@ -222,7 +98,6 @@ def margin_data_resampler(
|
|
|
222
98
|
|
|
223
99
|
Parameters
|
|
224
100
|
----------
|
|
225
|
-
|
|
226
101
|
_dist_parms
|
|
227
102
|
Array of margins and firm counts extracted from Prof. Damodaran's margin data
|
|
228
103
|
|
|
@@ -235,12 +110,12 @@ def margin_data_resampler(
|
|
|
235
110
|
|
|
236
111
|
nthreads
|
|
237
112
|
Number of threads to use in generating margin data.
|
|
113
|
+
|
|
238
114
|
Returns
|
|
239
115
|
-------
|
|
240
116
|
Array of margin values
|
|
241
117
|
|
|
242
118
|
"""
|
|
243
|
-
|
|
244
119
|
_dist_parms = margin_data_builder()[0] if _dist_parms is None else _dist_parms
|
|
245
120
|
|
|
246
121
|
_seed = seed_sequence or SeedSequence(pool_size=8)
|
|
@@ -248,23 +123,204 @@ def margin_data_resampler(
|
|
|
248
123
|
_x, _w = _dist_parms[:, 0], _dist_parms[:, 1]
|
|
249
124
|
|
|
250
125
|
margin_kde = stats.gaussian_kde(_x, weights=_w, bw_method="silverman")
|
|
126
|
+
# preserve multiplicity of modes:
|
|
251
127
|
margin_kde.set_bandwidth(bw_method=margin_kde.factor / 3.0)
|
|
252
128
|
|
|
253
129
|
if isinstance(sample_size, int):
|
|
254
|
-
|
|
130
|
+
ret_array: ArrayDouble = margin_kde.resample(
|
|
131
|
+
sample_size, seed=Generator(DEFAULT_BITGENERATOR(_seed))
|
|
132
|
+
).T
|
|
255
133
|
|
|
256
134
|
elif isinstance(sample_size, tuple) and len(sample_size) == 2:
|
|
257
135
|
ret_array = np.empty(sample_size, float)
|
|
258
136
|
|
|
259
137
|
_ssz, _ncol = sample_size
|
|
260
|
-
|
|
261
|
-
|
|
138
|
+
_threads = min(nthreads, _ncol)
|
|
139
|
+
dat_list = Parallel(n_jobs=_threads, prefer="threads")(
|
|
140
|
+
delayed(margin_kde.resample)(
|
|
141
|
+
_ssz, seed=Generator(DEFAULT_BITGENERATOR(_col_seed))
|
|
142
|
+
)
|
|
262
143
|
for _col_seed in _seed.spawn(_ncol)
|
|
263
144
|
)
|
|
264
145
|
|
|
265
146
|
for _i in range(_ncol):
|
|
266
|
-
ret_array[:,
|
|
267
|
-
|
|
268
|
-
return ret_array
|
|
147
|
+
ret_array[:, _i] = dat_list[_i][0]
|
|
269
148
|
else:
|
|
270
149
|
raise ValueError(f"Invalid sample size: {sample_size!r}")
|
|
150
|
+
|
|
151
|
+
return ret_array
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def margin_data_builder(
|
|
155
|
+
_margin_data_dict: DamodaranMarginData | None = None,
|
|
156
|
+
) -> tuple[ArrayDouble, ArrayDouble]:
|
|
157
|
+
"""Derive average firm-counts and gross-margins by industry from source data."""
|
|
158
|
+
_margin_data_dict = (
|
|
159
|
+
margin_data_getter() if _margin_data_dict is None else _margin_data_dict
|
|
160
|
+
)
|
|
161
|
+
dmd_keys = set()
|
|
162
|
+
for _k, _v in _margin_data_dict.items():
|
|
163
|
+
dmd_keys.update(set(_v.keys()))
|
|
164
|
+
dmd_keys = sorted(dmd_keys)
|
|
165
|
+
|
|
166
|
+
dist_parms = np.array([np.nan, np.nan], dtype=float)
|
|
167
|
+
for _sk in dmd_keys:
|
|
168
|
+
if _sk in FINANCIAL_INDUSTRIES or _sk.startswith("TOTAL"):
|
|
169
|
+
continue
|
|
170
|
+
|
|
171
|
+
_missing = {"GROSS MARGIN": 0.0, "NUMBER OF FIRMS": 0.0}
|
|
172
|
+
gm, fc = zip(*[
|
|
173
|
+
[_v.get(_sk, _missing).get(_f) for _f in _missing]
|
|
174
|
+
for _k, _v in _margin_data_dict.items()
|
|
175
|
+
])
|
|
176
|
+
|
|
177
|
+
average_margin, firm_count = np.array(gm, float), np.array(fc, int)
|
|
178
|
+
# print(firm_count, average_margin)
|
|
179
|
+
dist_parms = np.vstack((
|
|
180
|
+
dist_parms,
|
|
181
|
+
np.array((
|
|
182
|
+
np.average(
|
|
183
|
+
average_margin, weights=(average_margin > 0) * (firm_count > 0)
|
|
184
|
+
),
|
|
185
|
+
np.average(firm_count, weights=(average_margin > 0) * (firm_count > 0)),
|
|
186
|
+
)),
|
|
187
|
+
))
|
|
188
|
+
|
|
189
|
+
dist_parms = dist_parms[1:, :]
|
|
190
|
+
|
|
191
|
+
obs_, wts_ = (dist_parms[:, _f] for _f in range(2))
|
|
192
|
+
|
|
193
|
+
avg_gm, num_firms = np.average(obs_, weights=wts_, returned=True)
|
|
194
|
+
std_gm = np.sqrt(
|
|
195
|
+
np.average((obs_ - avg_gm) ** 2, weights=wts_)
|
|
196
|
+
* num_firms
|
|
197
|
+
* len(obs_)
|
|
198
|
+
/ ((num_firms - len(obs_)) * (len(obs_) - 1))
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
return dist_parms, np.array([avg_gm, std_gm, obs_.min(), obs_.max()], float)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def margin_data_getter(
|
|
205
|
+
*, data_archive_path: Path | None = None, data_download_flag: bool = False
|
|
206
|
+
) -> DamodaranMarginData:
|
|
207
|
+
"""Download and parse Prof.Damodaran's margin data."""
|
|
208
|
+
data_archive_path = (
|
|
209
|
+
MGNDATA_ARCHIVE_PATH if data_archive_path is None else data_archive_path
|
|
210
|
+
)
|
|
211
|
+
if data_archive_path.is_file() and not data_download_flag:
|
|
212
|
+
with zipfile.ZipFile(data_archive_path) as _yzp:
|
|
213
|
+
margin_data_dict: DamodaranMarginData = this_yaml.load(
|
|
214
|
+
_yzp.read(data_archive_path.with_suffix(".yaml").name)
|
|
215
|
+
)
|
|
216
|
+
return margin_data_dict
|
|
217
|
+
|
|
218
|
+
# Get workbooks from source
|
|
219
|
+
elif data_download_flag or not list(data_archive_path.glob("margin*.xls")):
|
|
220
|
+
margin_data_downloader()
|
|
221
|
+
|
|
222
|
+
# Whitespace cleanup
|
|
223
|
+
ws_pat = re.compile(r"\s+")
|
|
224
|
+
|
|
225
|
+
# Parse workbooks and save margin data dictionary
|
|
226
|
+
margin_data_dict = {}
|
|
227
|
+
for _p in (WORK_DIR / "damodaran_margin_data_archive").iterdir():
|
|
228
|
+
xl_wbk = CalamineWorkbook.from_path(_p)
|
|
229
|
+
xl_wks = xl_wbk.get_sheet_by_index(
|
|
230
|
+
0 if (_p.stem != "margin" and int(_p.stem[-2:]) in {17, 18, 19}) else 1
|
|
231
|
+
).to_python()
|
|
232
|
+
if xl_wks[8][2] != "Gross Margin":
|
|
233
|
+
raise ValueError("Worksheet does not match expected layout.")
|
|
234
|
+
|
|
235
|
+
update = xl_wks[0][1].isoformat()[:10]
|
|
236
|
+
margin_data_annual = margin_data_dict.setdefault(update, {})
|
|
237
|
+
row_keys: list[str] = []
|
|
238
|
+
read_row_flag = False
|
|
239
|
+
for xl_row in xl_wks:
|
|
240
|
+
row_key = _s.upper() if isinstance((_s := xl_row[0]), str) else ""
|
|
241
|
+
|
|
242
|
+
if ws_pat.sub(" ", row_key) == "INDUSTRY NAME":
|
|
243
|
+
read_row_flag = True
|
|
244
|
+
row_keys = [_c.upper() for _c in xl_row]
|
|
245
|
+
continue
|
|
246
|
+
elif not read_row_flag or not row_key or row_key.startswith("TOTAL"):
|
|
247
|
+
continue
|
|
248
|
+
else:
|
|
249
|
+
xl_row[1] = int(xl_row[1])
|
|
250
|
+
margin_data_annual |= MappingProxyType({
|
|
251
|
+
row_key: MappingProxyType(
|
|
252
|
+
dict(zip(row_keys[1:], xl_row[1:], strict=True))
|
|
253
|
+
)
|
|
254
|
+
})
|
|
255
|
+
|
|
256
|
+
damodaran_margin_data = MappingProxyType(margin_data_dict)
|
|
257
|
+
with (
|
|
258
|
+
zipfile.ZipFile(data_archive_path, "w") as _yzp,
|
|
259
|
+
_yzp.open(f"{data_archive_path.stem}.yaml", "w") as _yfh,
|
|
260
|
+
):
|
|
261
|
+
this_yaml.dump(damodaran_margin_data, _yfh)
|
|
262
|
+
|
|
263
|
+
return damodaran_margin_data
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def margin_data_downloader() -> DamodaranMarginData:
|
|
267
|
+
"""Download Prof.Damodaran's margin data."""
|
|
268
|
+
_u3pm = urllib3.PoolManager(ca_certs=certifi.where())
|
|
269
|
+
_data_source_url = "https://pages.stern.nyu.edu/~adamodar/pc/datasets/"
|
|
270
|
+
_archive_source_url = "https://pages.stern.nyu.edu/~adamodar/pc/archives/"
|
|
271
|
+
|
|
272
|
+
dest_dir = WORK_DIR / "damodaran_margin_data_archive"
|
|
273
|
+
|
|
274
|
+
# Get current-year margin data
|
|
275
|
+
workbook_name = "margin.xls"
|
|
276
|
+
workbook_path = dest_dir / workbook_name
|
|
277
|
+
if workbook_path.is_file():
|
|
278
|
+
workbook_path.unlink()
|
|
279
|
+
|
|
280
|
+
u3pm = urllib3.PoolManager(ca_certs=certifi.where())
|
|
281
|
+
download_file(u3pm, f"{_data_source_url}{workbook_name}", workbook_path)
|
|
282
|
+
|
|
283
|
+
# Get archived margin data
|
|
284
|
+
workbook_re = re.compile(r"margin(\d{2}).xls")
|
|
285
|
+
archive_html = _u3pm.request("GET", _archive_source_url).data.decode("utf-8")
|
|
286
|
+
archive_tree = BeautifulSoup(archive_html, "lxml")
|
|
287
|
+
for tag in archive_tree.find_all("a"):
|
|
288
|
+
if (
|
|
289
|
+
(_r := workbook_re.fullmatch(_w := tag.get("href", "")))
|
|
290
|
+
and int(_r[1]) > 16
|
|
291
|
+
and int(_r[1]) not in {98, 99}
|
|
292
|
+
):
|
|
293
|
+
_url, _path = f"{_archive_source_url}{_w}", dest_dir / _w
|
|
294
|
+
if _path.is_file():
|
|
295
|
+
_path.unlink()
|
|
296
|
+
|
|
297
|
+
download_file(_u3pm, _url, _path)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def download_file(_u3pm: urllib3.PoolManager, _url: str, _path: Path) -> None:
|
|
301
|
+
"""Download a a binary file from URL to filesystem path."""
|
|
302
|
+
chunk_size_ = 1024 * 1024
|
|
303
|
+
with (
|
|
304
|
+
_u3pm.request("GET", _url, preload_content=False) as _uh,
|
|
305
|
+
_path.open("wb") as _fh,
|
|
306
|
+
):
|
|
307
|
+
while True:
|
|
308
|
+
data_ = _uh.read(chunk_size_)
|
|
309
|
+
if not data_:
|
|
310
|
+
break
|
|
311
|
+
_fh.write(data_)
|
|
312
|
+
os.utime(
|
|
313
|
+
_path,
|
|
314
|
+
times=(
|
|
315
|
+
(
|
|
316
|
+
_t := datetime.datetime.strptime(
|
|
317
|
+
_uh.headers["Last-Modified"], "%a, %d %b %Y %H:%M:%S %Z"
|
|
318
|
+
)
|
|
319
|
+
.astimezone(datetime.UTC)
|
|
320
|
+
.timestamp()
|
|
321
|
+
),
|
|
322
|
+
_t,
|
|
323
|
+
),
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
print(f"Downloaded {_url} to {_path}.")
|