mergeron 2024.738973.0__py3-none-any.whl → 2024.739079.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mergeron might be problematic.

Files changed (37)
  1. mergeron/__init__.py +28 -3
  2. mergeron/core/__init__.py +2 -77
  3. mergeron/core/damodaran_margin_data.py +66 -52
  4. mergeron/core/excel_helper.py +39 -37
  5. mergeron/core/ftc_merger_investigations_data.py +66 -35
  6. mergeron/core/guidelines_boundaries.py +261 -234
  7. mergeron/core/guidelines_boundary_functions.py +182 -27
  8. mergeron/core/guidelines_boundary_functions_extra.py +17 -14
  9. mergeron/core/proportions_tests.py +2 -4
  10. mergeron/core/pseudorandom_numbers.py +6 -11
  11. mergeron/data/__init__.py +3 -0
  12. mergeron/data/damodaran_margin_data.xls +0 -0
  13. mergeron/data/damodaran_margin_data_dict.msgpack +0 -0
  14. mergeron/{jinja_LaTex_templates/setup_tikz_tables.tex.jinja2 → data/jinja2_LaTeX_templates/setup_tikz_tables.tex} +45 -50
  15. mergeron/demo/__init__.py +3 -0
  16. mergeron/demo/visualize_empirical_margin_distribution.py +88 -0
  17. mergeron/ext/__init__.py +2 -4
  18. mergeron/ext/tol_colors.py +3 -3
  19. mergeron/gen/__init__.py +53 -46
  20. mergeron/gen/_data_generation_functions.py +28 -93
  21. mergeron/gen/data_generation.py +20 -24
  22. mergeron/gen/{investigations_stats.py → enforcement_stats.py} +59 -57
  23. mergeron/gen/market_sample.py +6 -10
  24. mergeron/gen/upp_tests.py +29 -26
  25. mergeron-2024.739079.10.dist-info/METADATA +109 -0
  26. mergeron-2024.739079.10.dist-info/RECORD +36 -0
  27. mergeron/core/InCommon RSA Server CA cert chain.pem +0 -68
  28. mergeron-2024.738973.0.dist-info/METADATA +0 -108
  29. mergeron-2024.738973.0.dist-info/RECORD +0 -32
  30. /mergeron/{core → data}/ftc_invdata.msgpack +0 -0
  31. /mergeron/{jinja_LaTex_templates → data/jinja2_LaTeX_templates}/clrrate_cis_summary_table_template.tex.jinja2 +0 -0
  32. /mergeron/{jinja_LaTex_templates → data/jinja2_LaTeX_templates}/ftcinvdata_byhhianddelta_table_template.tex.jinja2 +0 -0
  33. /mergeron/{jinja_LaTex_templates → data/jinja2_LaTeX_templates}/ftcinvdata_summary_table_template.tex.jinja2 +0 -0
  34. /mergeron/{jinja_LaTex_templates → data/jinja2_LaTeX_templates}/ftcinvdata_summarypaired_table_template.tex.jinja2 +0 -0
  35. /mergeron/{jinja_LaTex_templates → data/jinja2_LaTeX_templates}/mergeron.cls +0 -0
  36. /mergeron/{jinja_LaTex_templates → data/jinja2_LaTeX_templates}/mergeron_table_collection_template.tex.jinja2 +0 -0
  37. {mergeron-2024.738973.0.dist-info → mergeron-2024.739079.10.dist-info}/WHEEL +0 -0
mergeron/__init__.py CHANGED
@@ -1,12 +1,19 @@
  from __future__ import annotations
 
  import enum
- from importlib.metadata import version
  from pathlib import Path
+ from typing import Any
+
+ import numpy as np
+ import pendulum  # type: ignore
+ from icecream import argumentToString, ic, install  # type: ignore
+ from numpy.typing import NDArray
 
  _PKG_NAME: str = Path(__file__).parent.stem
 
- __version__ = version(_PKG_NAME)
+ VERSION = "2024.739079.10"
+
+ __version__ = VERSION
 
  DATA_DIR: Path = Path.home() / _PKG_NAME
  """
@@ -14,11 +21,26 @@ Defines a subdirectory named for this package in the user's home path.
 
  If the subdirectory doesn't exist, it is created on package invocation.
  """
-
  if not DATA_DIR.is_dir():
      DATA_DIR.mkdir(parents=False)
 
 
+ np.set_printoptions(precision=18)
+
+
+ def _timestamper() -> str:
+     return f"{pendulum.now().strftime('%F %T.%f')} |> "
+
+
+ @argumentToString.register(np.ndarray)  # type: ignore
+ def _(_obj: NDArray[Any]) -> str:
+     return f"ndarray, shape={_obj.shape}, dtype={_obj.dtype}"
+
+
+ ic.configureOutput(prefix=_timestamper, includeContext=True)
+ install()
+
+
  @enum.unique
  class RECConstants(enum.StrEnum):
      """Recapture rate - derivation methods."""
@@ -38,8 +60,11 @@ class UPPAggrSelector(enum.StrEnum):
      AVG = "average"
      CPA = "cross-product-share weighted average"
      CPD = "cross-product-share weighted distance"
+     CPG = "cross-product-share weighted geometric mean"
      DIS = "symmetrically-weighted distance"
+     GMN = "geometric mean"
      MAX = "max"
      MIN = "min"
      OSA = "own-share weighted average"
      OSD = "own-share weighted distance"
+     OSG = "own-share weighted geometric mean"
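
Note: the new icecream hooks in mergeron/__init__.py register a compact formatter for NumPy arrays, so installed ic() calls log a shape/dtype summary instead of dumping array contents. A minimal sketch of the behavior the added hook produces (the sample array and the echoed output are illustrative):

    import numpy as np
    from icecream import argumentToString, ic

    @argumentToString.register(np.ndarray)
    def _(_obj: np.ndarray) -> str:
        # Summarize large arrays rather than printing their contents
        return f"ndarray, shape={_obj.shape}, dtype={_obj.dtype}"

    ic(np.zeros((10**6, 2)))
    # ic| np.zeros((10**6, 2)): ndarray, shape=(1000000, 2), dtype=float64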
mergeron/core/__init__.py CHANGED
@@ -1,78 +1,3 @@
- from __future__ import annotations
+ from .. import VERSION  # noqa: TID252
 
- from dataclasses import dataclass
- from importlib.metadata import version
-
- import numpy as np
- from attrs import Attribute, field, frozen, validators
- from numpy.typing import NDArray
-
- from .. import _PKG_NAME, RECConstants, UPPAggrSelector  # noqa: TID252
-
- __version__ = version(_PKG_NAME)
-
-
- @dataclass(frozen=True)
- class GuidelinesBoundary:
-     coordinates: NDArray[np.float64]
-     area: float
-
-
- def _divr_value_validator(
-     _instance: UPPBoundarySpec, _attribute: Attribute[float], _value: float, /
- ) -> None:
-     if not 0 <= _value <= 1:
-         raise ValueError(
-             "Margin-adjusted benchmark share ratio must lie between 0 and 1."
-         )
-
-
- def _rec_spec_validator(
-     _instance: UPPBoundarySpec,
-     _attribute: Attribute[RECConstants],
-     _value: RECConstants,
-     /,
- ) -> None:
-     if _value == RECConstants.OUTIN:
-         raise ValueError(
-             f"Invalid recapture specification, {_value!r}. "
-             "You may consider specifying RECConstants.INOUT here, and "
-             "assigning the recapture rate for the merging-firm with "
-             'the smaller market-share to the attribue, "rec" of '
-             "the UPPBoundarySpec object you pass."
-         )
-     if _value is None and _instance.agg_method != UPPAggrSelector.MAX:
-         raise ValueError(
-             f"Specified aggregation method, {_instance.agg_method} requires a recapture specification."
-         )
-
-
- @frozen
- class UPPBoundarySpec:
-     diversion_ratio: float = field(
-         kw_only=False,
-         default=0.045,
-         validator=(validators.instance_of(float), _divr_value_validator),
-     )
-     rec: float = field(
-         kw_only=False, default=0.855, validator=validators.instance_of(float)
-     )
-
-     agg_method: UPPAggrSelector = field(
-         kw_only=True,
-         default=UPPAggrSelector.MAX,
-         validator=validators.instance_of(UPPAggrSelector),
-     )
-
-     recapture_form: RECConstants | None = field(
-         kw_only=True,
-         default=RECConstants.INOUT,
-         validator=(
-             validators.instance_of((type(None), RECConstants)),
-             _rec_spec_validator,
-         ),
-     )
-
-     precision: int = field(
-         kw_only=False, default=5, validator=validators.instance_of(int)
-     )
+ __version__ = VERSION
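
Note: the same version-handling pattern recurs across the subpackage __init__ modules below: instead of each module calling importlib.metadata.version() at import time, the top-level module now hard-codes VERSION and subpackages re-export it. A sketch of the recurring idiom:

    # Subpackage __init__ (as added in this release)
    from .. import VERSION  # noqa: TID252

    __version__ = VERSION  # no importlib.metadata lookup on import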
mergeron/core/damodaran_margin_data.py CHANGED
@@ -7,7 +7,8 @@ Data are downloaded or reused from a local copy, on demand.
  For terms of use of Prof. Damodaran's data, please see:
  https://pages.stern.nyu.edu/~adamodar/New_Home_Page/datahistory.html
 
- Important caveats:
+ NOTES
+ -----
 
  Prof. Damodaran notes that the data construction may not be
  consistent from iteration to iteration. He also notes that,
@@ -32,29 +33,30 @@ price-cost margins fall in the interval :math:`[0, 1]`.
 
  """
 
+ import shutil
  from collections.abc import Mapping
- from importlib.metadata import version
+ from importlib import resources
  from pathlib import Path
  from types import MappingProxyType
 
  import msgpack  # type:ignore
  import numpy as np
- import requests
+ import urllib3
  from numpy.random import PCG64DXSM, Generator, SeedSequence
  from numpy.typing import NDArray
- from requests_toolbelt.downloadutils import stream  # type: ignore
  from scipy import stats  # type: ignore
  from xlrd import open_workbook  # type: ignore
 
- from .. import _PKG_NAME, DATA_DIR  # noqa: TID252
-
- __version__ = version(_PKG_NAME)
+ from .. import _PKG_NAME, DATA_DIR, VERSION  # noqa: TID252
 
+ __version__ = VERSION
 
  MGNDATA_ARCHIVE_PATH = DATA_DIR / "damodaran_margin_data_dict.msgpack"
 
+ u3pm = urllib3.PoolManager()
+
 
- def scrape_data_table(
+ def mgn_data_getter(
      _table_name: str = "margin",
      *,
      data_archive_path: Path | None = None,
@@ -68,32 +70,46 @@ def scrape_data_table(
      _data_archive_path = data_archive_path or MGNDATA_ARCHIVE_PATH
 
      _mgn_urlstr = f"https://pages.stern.nyu.edu/~adamodar/pc/datasets/{_table_name}.xls"
-     _mgn_path = _data_archive_path.parent.joinpath(f"damodaran_{_table_name}_data.xls")
+     _mgn_path = _data_archive_path.parent / f"damodaran_{_table_name}_data.xls"
      if _data_archive_path.is_file() and not data_download_flag:
          return MappingProxyType(msgpack.unpackb(_data_archive_path.read_bytes()))
      elif _mgn_path.is_file():
          _mgn_path.unlink()
-         _data_archive_path.unlink()
-
-     _REQ_TIMEOUT = (9.05, 27)
-     # NYU will eventually updates its server certificate, to one signed with
-     # "InCommon RSA Server CA 2.pem", the step below will be obsolete. In
-     # the interim, it is necessary to provide the certificate chain to the
-     # root CA, so that the obsolete CA certificate is validated.
-     _INCOMMON_2014_CERT_CHAIN_PATH = (
-         Path(__file__).parent / "InCommon RSA Server CA cert chain.pem"
-     )
-     try:
-         _urlopen_handle = requests.get(_mgn_urlstr, timeout=_REQ_TIMEOUT, stream=True)
-     except requests.exceptions.SSLError:
-         _urlopen_handle = requests.get(
-             _mgn_urlstr,
-             timeout=_REQ_TIMEOUT,
-             stream=True,
-             verify=str(_INCOMMON_2014_CERT_CHAIN_PATH),
-         )
+     if _data_archive_path.is_file():
+         _data_archive_path.unlink()
 
-     _mgn_filename = stream.stream_response_to_file(_urlopen_handle, path=_mgn_path)
+     try:
+         _chunk_size = 1024 * 1024
+         with (
+             u3pm.request("GET", _mgn_urlstr, preload_content=False) as _urlopen_handle,
+             _mgn_path.open("wb") as _mgn_file,
+         ):
+             while True:
+                 _data = _urlopen_handle.read(_chunk_size)
+                 if not _data:
+                     break
+                 _mgn_file.write(_data)
+
+         print(f"Downloaded {_mgn_urlstr} to {_mgn_path}.")
+
+     except urllib3.exceptions.MaxRetryError as _err:
+         if isinstance(_err.__cause__, urllib3.exceptions.SSLError):
+             # Works fine with other sites secured with certificates
+             # from the Internet2 CA, such as
+             # https://snap.stanford.edu/data/web-Stanford.txt.gz
+             print(
+                 f"WARNING: Could not establish secure connection to {_mgn_urlstr}. "
+                 "Using bundled copy."
+             )
+             if not _mgn_path.is_file():
+                 with resources.as_file(
+                     resources.files(f"{_PKG_NAME}.data").joinpath(
+                         "damodaran_margin_data.xls"
+                     )
+                 ) as _mgn_data_archive_path:
+                     shutil.copy2(_mgn_data_archive_path, _mgn_path)
+         else:
+             raise _err
 
      _xl_book = open_workbook(_mgn_path, ragged_rows=True, on_demand=True)
      _xl_sheet = _xl_book.sheet_by_name("Industry Averages")
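
Note: the rewritten download path drops requests/requests_toolbelt in favor of a bare urllib3 PoolManager with manual chunked streaming. Standalone, the pattern looks like this (the URL, filename, and chunk size are illustrative):

    import urllib3

    _pool = urllib3.PoolManager()
    _chunk_size = 1024 * 1024  # read 1 MiB at a time

    with (
        _pool.request("GET", "https://example.com/data.xls", preload_content=False) as _resp,
        open("data.xls", "wb") as _fh,
    ):
        # preload_content=False streams the body instead of buffering it in memory
        while _data := _resp.read(_chunk_size):
            _fh.write(_data)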
@@ -123,7 +139,7 @@ def mgn_data_builder(
      _mgn_tbl_dict: Mapping[str, Mapping[str, float | int]] | None = None, /
  ) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
      if _mgn_tbl_dict is None:
-         _mgn_tbl_dict = scrape_data_table()
+         _mgn_tbl_dict = mgn_data_getter()
 
      _mgn_data_wts, _mgn_data_obs = (
          _f.flatten()
@@ -169,17 +185,19 @@ def mgn_data_builder(
  )
 
 
- def resample_mgn_data(
+ def mgn_data_resampler(
      _sample_size: int | tuple[int, int] = (10**6, 2),
      /,
      *,
      seed_sequence: SeedSequence | None = None,
  ) -> NDArray[np.float64]:
      """
-     Generate the specified number of draws from the empirical distribution
-     for Prof. Damodaran's margin data using the estimated Gaussian KDE.
-     Margins for firms in finance, investment, insurance, reinsurance, and REITs
-     are excluded from the sample used to estimate the Gaussian KDE.
+     Generate draws from the empirical distribution based on Prof. Damodaran's margin data.
+
+     The empirical distribution is estimated using a Gaussian KDE; the bandwidth
+     selected using Silverman's rule is narrowed to reflect that the margin data
+     are multimodal. Margins for firms in finance, investment, insurance, reinsurance, and
+     REITs are excluded from the sample used to estimate the empirical distribution.
 
      Parameters
      ----------
@@ -198,28 +216,24 @@ def resample_mgn_data(
 
      _seed_sequence = seed_sequence or SeedSequence(pool_size=8)
 
-     _x, _w, _ = mgn_data_builder(scrape_data_table())
+     _x, _w, _ = mgn_data_builder(mgn_data_getter())
 
-     _mgn_kde = stats.gaussian_kde(_x, weights=_w)
+     _mgn_kde = stats.gaussian_kde(_x, weights=_w, bw_method="silverman")
+     _mgn_kde.set_bandwidth(bw_method=_mgn_kde.factor / 3.0)
 
-     def _generate_draws(
-         _mgn_kde: stats.gaussian_kde, _ssz: int, _seed_seq: SeedSequence
-     ) -> NDArray[np.float64]:
-         _seed = Generator(PCG64DXSM(_seed_sequence))
-
-         # We enlarge the sample, then truncate to
-         # the range between [0.0, 1.0)
-         ssz_up = int(_ssz / (_mgn_kde.integrate_box_1d(0.0, 1.0) ** 2))
-         sample_1 = _mgn_kde.resample(ssz_up, seed=_seed)[0]
+     if isinstance(_sample_size, int):
          return np.array(
-             sample_1[(sample_1 >= 0.0) & (sample_1 <= 1)][:_ssz], np.float64
+             _mgn_kde.resample(_sample_size, seed=Generator(PCG64DXSM(_seed_sequence)))[
+                 0
+             ]
          )
-
-     if isinstance(_sample_size, int):
-         return _generate_draws(_mgn_kde, _sample_size, _seed_sequence)
-     else:
+     elif isinstance(_sample_size, tuple) and len(_sample_size) == 2:
          _ssz, _num_cols = _sample_size
          _ret_array = np.empty(_sample_size, np.float64)
          for _idx, _seed_seq in enumerate(_seed_sequence.spawn(_num_cols)):
-             _ret_array[:, _idx] = _generate_draws(_mgn_kde, _ssz, _seed_seq)
+             _ret_array[:, _idx] = _mgn_kde.resample(
+                 _ssz, seed=Generator(PCG64DXSM(_seed_seq))
+             )[0]
          return _ret_array
+     else:
+         raise ValueError(f"Invalid sample size: {_sample_size!r}")
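
Note: the resampler change is substantive, not just a rename: draws are no longer oversampled and truncated to [0, 1], and the Gaussian-KDE bandwidth is narrowed to one third of the Silverman value to preserve the multimodality of the margin data. The bandwidth manipulation relies on standard scipy machinery, sketched here with synthetic bimodal data in place of the Damodaran margins:

    import numpy as np
    from numpy.random import PCG64DXSM, Generator, SeedSequence
    from scipy import stats

    _rng = Generator(PCG64DXSM(SeedSequence(12345)))
    _x = np.concatenate([_rng.normal(0.2, 0.03, 500), _rng.normal(0.6, 0.05, 500)])

    _kde = stats.gaussian_kde(_x, bw_method="silverman")
    _kde.set_bandwidth(bw_method=_kde.factor / 3.0)  # narrower bandwidth keeps both modes

    _draws = _kde.resample(10**4, seed=_rng)[0]  # resample() returns shape (1, n)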
mergeron/core/excel_helper.py CHANGED
@@ -8,29 +8,21 @@ Includes a flexible system of defining cell formats.
 
  from __future__ import annotations
 
- import enum
  from collections.abc import Mapping, Sequence
- from importlib.metadata import version
- from types import MappingProxyType
- from typing import Any
+ from typing import Any, ClassVar
 
  import numpy as np
  import numpy.typing as npt
  import xlsxwriter  # type: ignore
+ from aenum import Enum, unique  # type: ignore
 
- from .. import _PKG_NAME  # noqa: TID252
+ from .. import VERSION  # noqa: TID252
 
- __version__ = version(_PKG_NAME)
+ __version__ = VERSION
 
 
- @enum.unique
- class CFmtParent(dict[str, Any], enum.ReprEnum):  # type: ignore
-     """Unique mappings defining xlsxwirter Workbook formats"""
-
-     ...
-
-
- class CFmt(CFmtParent):
+ @unique
+ class CFmt(Enum):  # type: ignore
      """
      Initialize cell formats for xlsxwriter.
 
@@ -42,31 +34,41 @@ class CFmt(CFmtParent):
      See, https://xlsxwriter.readthedocs.io/format.html
      """
 
-     XL_DEFAULT = MappingProxyType({"font_name": "Calibri", "font_size": 11})
-     XL_DEFAULT_2003 = MappingProxyType({"font_name": "Arial", "font_size": 10})
+     XL_DEFAULT: ClassVar = {"font_name": "Calibri", "font_size": 11}
+     XL_DEFAULT_2003: ClassVar = {"font_name": "Arial", "font_size": 10}
+
+     A_CTR: ClassVar = {"align": "center"}
+     A_CTR_ACROSS: ClassVar = {"align": "center_across"}
+     A_LEFT: ClassVar = {"align": "left"}
+     A_RIGHT: ClassVar = {"align": "right"}
 
-     A_CTR = MappingProxyType({"align": "center"})
-     A_CTR_ACROSS = MappingProxyType({"align": "center_across"})
-     A_LEFT = MappingProxyType({"align": "left"})
-     A_RIGHT = MappingProxyType({"align": "right"})
+     BOLD: ClassVar = {"bold": True}
+     BOLD_ITALIC: ClassVar = {"bold": True, "italic": True}
+     ITALIC: ClassVar = {"italic": True}
+     ULINE: ClassVar = {"underline": True}
 
-     BOLD = MappingProxyType({"bold": True})
-     ITALIC = MappingProxyType({"italic": True})
-     ULINE = MappingProxyType({"underline": True})
+     TEXT_WRAP: ClassVar = {"text_wrap": True}
+     TEXT_ROTATE: ClassVar = {"rotation": 90}
+     IND_1: ClassVar = {"indent": 1}
 
-     TEXT_WRAP = MappingProxyType({"text_wrap": True})
-     IND_1 = MappingProxyType({"indent": 1})
+     DOLLAR_NUM: ClassVar = {"num_format": "[$$-409]#,##0.00"}
+     DT_NUM: ClassVar = {"num_format": "mm/dd/yyyy"}
+     QTY_NUM: ClassVar = {"num_format": "#,##0.0"}
+     PCT_NUM: ClassVar = {"num_format": "##0%"}
+     PCT2_NUM: ClassVar = {"num_format": "##0.00%"}
+     PCT4_NUM: ClassVar = {"num_format": "##0.0000%"}
+     PCT6_NUM: ClassVar = {"num_format": "##0.000000%"}
+     PCT8_NUM: ClassVar = {"num_format": "##0.00000000%"}
+     AREA_NUM: ClassVar = {"num_format": "0.00000000"}
 
-     DOLLAR_NUM = MappingProxyType({"num_format": "[$$-409]#,##0.00"})
-     DT_NUM = MappingProxyType({"num_format": "mm/dd/yyyy"})
-     QTY_NUM = MappingProxyType({"num_format": "#,##0.0"})
-     PCT_NUM = MappingProxyType({"num_format": "##0.000000%"})
-     AREA_NUM = MappingProxyType({"num_format": "0.00000000"})
+     BAR_FILL: ClassVar = {"pattern": 1, "bg_color": "dfeadf"}
+     HDR_FILL: ClassVar = {"pattern": 1, "bg_color": "999999"}
 
-     BAR_FILL = MappingProxyType({"pattern": 1, "bg_color": "dfeadf"})
-     BOT_BORDER = MappingProxyType({"bottom": 1, "bottom_color": "000000"})
-     TOP_BORDER = MappingProxyType({"top": 1, "top_color": "000000"})
-     HDR_BORDER = TOP_BORDER | BOT_BORDER
+     LEFT_BORDER: ClassVar = {"left": 1, "left_color": "000000"}
+     RIGHT_BORDER: ClassVar = {"right": 1, "right_color": "000000"}
+     BOT_BORDER: ClassVar = {"bottom": 1, "bottom_color": "000000"}
+     TOP_BORDER: ClassVar = {"top": 1, "top_color": "000000"}
+     HDR_BORDER: ClassVar = TOP_BORDER | BOT_BORDER
 
 
  def matrix_to_sheet(
@@ -208,7 +210,7 @@ def xl_fmt(
      _xl_book: xlsxwriter.Workbook, _cell_fmt: Sequence[CFmt] | CFmt | None, /
  ) -> xlsxwriter.format.Format:
      """
-     Return :code:`xlsxwriter` `Format` object given a CFmt enum, or tuple thereof.
+     Return :code:`xlsxwriter` `Format` object given a CFmt aenum, or tuple thereof.
 
      Parameters
      ----------
@@ -216,14 +218,14 @@ def xl_fmt(
          :code:`xlsxwriter.Workbook` object
 
      _cell_fmt
-         :code:`CFmt` enum object, or tuple thereof
+         :code:`CFmt` aenum object, or tuple thereof
 
      Returns
      -------
      :code:`xlsxwriter` `Format` object
 
      """
-     _cell_fmt_dict: Mapping[str, Any] = MappingProxyType({})
+     _cell_fmt_dict: Mapping[str, Any] = {}
      if isinstance(_cell_fmt, tuple):
          ensure_cell_format_spec_tuple(_cell_fmt)
          for _cf in _cell_fmt:
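
Note: with CFmt now an aenum Enum whose member values are plain dicts (rather than MappingProxyType values), format specs can be merged with the dict union operator before being handed to xlsxwriter. A hypothetical usage sketch (the workbook name, member choice, and import path are illustrative):

    import xlsxwriter
    from mergeron.core.excel_helper import CFmt

    _book = xlsxwriter.Workbook("demo.xlsx")
    _sheet = _book.add_worksheet()
    # Merge two CFmt specs into a single xlsxwriter Format
    _fmt = _book.add_format(CFmt.BOLD.value | CFmt.PCT2_NUM.value)
    _sheet.write(0, 0, 0.1234, _fmt)  # renders as "12.34%" in bold
    _book.close()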
mergeron/core/ftc_merger_investigations_data.py CHANGED
@@ -4,13 +4,13 @@ as necessary
 
  NOTES
  -----
- We drop reported row and column totals from source data for reducing stored data.
+ Reported row and column totals from source data are not stored.
 
  """
 
  import shutil
  from collections.abc import Mapping, Sequence
- from importlib.metadata import version
+ from importlib import resources
  from operator import itemgetter
  from pathlib import Path
  from types import MappingProxyType
@@ -22,12 +22,13 @@ import numpy as np
  import re2 as re  # type: ignore
  import requests
  from bs4 import BeautifulSoup
+ from icecream import ic  # type: ignore
  from numpy.testing import assert_array_equal
  from numpy.typing import NDArray
 
- from .. import _PKG_NAME, DATA_DIR  # noqa: TID252
+ from .. import _PKG_NAME, DATA_DIR, VERSION  # noqa: TID252
 
- __version__ = version(_PKG_NAME)
+ __version__ = VERSION
 
  m.patch()
 
@@ -36,11 +37,16 @@ if not FTCDATA_DIR.is_dir():
      FTCDATA_DIR.mkdir(parents=True)
 
  INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.msgpack"
- if not INVDATA_ARCHIVE_PATH.is_file():
-     if (
-         _bundled_copy := Path(__file__).parent.joinpath(INVDATA_ARCHIVE_PATH.name)
-     ).is_file():
-         shutil.copyfile(_bundled_copy, INVDATA_ARCHIVE_PATH)
+ if (
+     not INVDATA_ARCHIVE_PATH.is_file()
+     and (
+         _bundled_copy := resources.files(f"{_PKG_NAME}.data").joinpath(
+             INVDATA_ARCHIVE_PATH.name
+         )
+     ).is_file()
+ ):
+     with resources.as_file(_bundled_copy) as _bundled_copy_path:
+         shutil.copy2(_bundled_copy_path, INVDATA_ARCHIVE_PATH)
 
  TABLE_NO_RE = re.compile(r"Table \d+\.\d+")
  TABLE_TYPES = ("ByHHIandDelta", "ByFirmCount")
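
Note: this module and damodaran_margin_data.py now both resolve bundled data files through importlib.resources rather than Path(__file__), which also works when the package is installed as a zip. The general fallback pattern, assuming a hypothetical package-data file example.msgpack:

    import shutil
    from importlib import resources
    from pathlib import Path

    _target = Path.home() / "mergeron" / "example.msgpack"
    _bundled = resources.files("mergeron.data").joinpath("example.msgpack")
    if not _target.is_file() and _bundled.is_file():
        # as_file() yields a real filesystem path even for zipped installs
        with resources.as_file(_bundled) as _bundled_path:
            shutil.copy2(_bundled_path, _target)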
@@ -86,8 +92,8 @@ CNT_FCOUNT_DICT = {
 
 
  class INVTableData(NamedTuple):
-     ind_grp: str
-     evid_cond: str
+     industry_group: str
+     additional_evidence: str
      data_array: NDArray[np.int64]
 
 
@@ -181,7 +187,9 @@ def construct_data(
      _aggr_tables_list = [
          _t
          for _t in _invdata["1996-2003"][_table_type]
-         if re.sub(r"\W", "", _invdata["1996-2003"][_table_type][_t].ind_grp)
+         if re.sub(
+             r"\W", "", _invdata["1996-2003"][_table_type][_t].industry_group
+         )
          not in _industry_exclusion_list
      ]
 
@@ -254,8 +262,8 @@ def _construct_new_period_data(
      for _table_no in _invdata_cuml[_table_type]:
          _invdata_cuml_sub_table = _invdata_cuml[_table_type][_table_no]
          _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array = (
-             _invdata_cuml_sub_table.ind_grp,
-             _invdata_cuml_sub_table.evid_cond,
+             _invdata_cuml_sub_table.industry_group,
+             _invdata_cuml_sub_table.additional_evidence,
              _invdata_cuml_sub_table.data_array,
          )
 
@@ -337,7 +345,7 @@ def _construct_new_period_data(
      #     _invdata_array_bld_enfcls < 0, _invdata_array_bld_enfcls, 0
      # )
      # if np.einsum('ij->', invdata_array_bld_tbc):
-     #     print(
+     #     ic(
      #         f"{_data_period}, {_table_no}, {_invdata_ind_group}:",
      #         abs(np.einsum('ij->', invdata_array_bld_tbc))
      #     )
@@ -395,22 +403,23 @@ def _parse_invdata() -> INVData:
      by range of HHI and ∆HHI.
 
      """
+     raise ValueError(
+         "This function is defined here as documentation.\n"
+         "NOTE: License for `pymupdf`, upon which this function depends,"
+         " may be incompatible with the MIT license,"
+         " under which this package is distributed."
+         " Making this function operable requires the user to modify"
+         " the source code as well as to install an additional package"
+         " not distributed with this package or included in its dependencies."
+     )
      import fitz  # type: ignore
-     # user must install pymupdf to make this function operable
 
-     _invdata_docnames: Sequence[str] = (
-         "040831horizmergersdata96-03.pdf",
-         "p035603horizmergerinvestigationdata1996-2005.pdf",
-         "081201hsrmergerdata.pdf",
-         "130104horizontalmergerreport.pdf",
-     )
+     _invdata_docnames = _download_invdata(FTCDATA_DIR)
 
      _invdata: dict[str, dict[str, dict[str, INVTableData]]] = {}
 
      for _invdata_docname in _invdata_docnames:
          _invdata_pdf_path = FTCDATA_DIR.joinpath(_invdata_docname)
-         if not _invdata_pdf_path.is_file():
-             _download_invdata(FTCDATA_DIR)
 
          _invdata_fitz = fitz.open(_invdata_pdf_path)
          _invdata_meta = _invdata_fitz.metadata
@@ -542,7 +551,7 @@ def _parse_table_blocks(
          _invdata_evid_cond = "Unrestricted on additional evidence"
 
      else:
-         # print(_table_blocks)
+         # ic(_table_blocks)
          _invdata_evid_cond = (
              _table_blocks[1][-3].strip()
              if _table_ser == 9
@@ -561,8 +570,8 @@ def _parse_table_blocks(
 
      _table_array = process_table_func(_table_blocks)
      if not isinstance(_table_array, np.ndarray) or _table_array.dtype != np.int64:
-         print(_table_num)
-         print(_table_blocks)
+         ic(_table_num)
+         ic(_table_blocks)
          raise ValueError
 
      _table_data = INVTableData(_invdata_ind_group, _invdata_evid_cond, _table_array)
@@ -610,7 +619,7 @@ def _process_table_blks_conc_type(
              _col_totals = _row_array
          else:
              _invdata_array = (
-                 np.row_stack((_invdata_array, _row_array))
+                 np.vstack((_invdata_array, _row_array))
                  if _invdata_array.shape
                  else _row_array
              )
@@ -657,7 +666,7 @@ def _process_table_blks_cnt_type(
              _col_totals = _row_list
          else:
              _invdata_array = (
-                 np.row_stack((_invdata_array, _row_list))
+                 np.vstack((_invdata_array, _row_list))
                  if _invdata_array.shape
                  else _row_list
              )
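
Note: the np.row_stack replacements here and above track NumPy 2.x, which deprecates the row_stack alias; np.vstack is the drop-in equivalent:

    import numpy as np

    _arr = np.array([[1, 2], [3, 4]])
    _row = np.array([5, 6])
    np.vstack((_arr, _row))  # array([[1, 2], [3, 4], [5, 6]])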
@@ -673,27 +682,43 @@ def _process_table_blks_cnt_type(
      return _invdata_array[np.argsort(_invdata_array[:, 0])]
 
 
- def _download_invdata(_dl_path: Path) -> list[Any]:
+ def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
+     if not _dl_path.is_dir():
+         _dl_path.mkdir(parents=True)
+
      _invdata_homepage_urls = (
          "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
          "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
          "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
          "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
      )
-     _invdata_docnames = []
+     _invdata_docnames = (
+         "040831horizmergersdata96-03.pdf",
+         "p035603horizmergerinvestigationdata1996-2005.pdf",
+         "081201hsrmergerdata.pdf",
+         "130104horizontalmergerreport.pdf",
+     )
+
+     if all(
+         _dl_path.joinpath(_invdata_docname).is_file()
+         for _invdata_docname in _invdata_docnames
+     ):
+         return _invdata_docnames
+
+     _invdata_docnames_dl: tuple[str, ...] = ()
      for _invdata_homepage_url in _invdata_homepage_urls:
          _invdata_soup = BeautifulSoup(
              requests.get(_invdata_homepage_url, verify=True, timeout=60).text,
              "html.parser",
          )
          _invdata_attrs = [
-             (_g.get("href", ""), _g.get("title", ""))
+             (_g.get("title", ""), _g.get("href", ""))
              for _g in _invdata_soup.find_all("a")
              if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
          ]
          for _invdata_attr in _invdata_attrs:
-             _invdata_link, _invdata_docname = _invdata_attr
-             _invdata_docnames += [_invdata_docname]
+             _invdata_docname, _invdata_link = _invdata_attr
+             _invdata_docnames_dl += (_invdata_docname,)
              with _dl_path.joinpath(_invdata_docname).open("wb") as _invdata_fh:
                  _invdata_fh.write(
                      requests.get(
@@ -701,4 +726,10 @@ def _download_invdata(_dl_path: Path) -> list[Any]:
                      ).content
                  )
 
-     return _invdata_docnames
+     return _invdata_docnames_dl
+
+
+ if __name__ == "__main__":
+     print(
+         "This module defines functions for downloading and preparing FTC merger investigations data for further analysis."
+     )