mergeron-2024.738963.0-py3-none-any.whl → mergeron-2025.739265.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mergeron/__init__.py +26 -6
- mergeron/core/__init__.py +5 -65
- mergeron/core/{damodaran_margin_data.py → empirical_margin_distribution.py} +74 -58
- mergeron/core/ftc_merger_investigations_data.py +142 -93
- mergeron/core/guidelines_boundaries.py +289 -1077
- mergeron/core/guidelines_boundary_functions.py +1128 -0
- mergeron/core/{guidelines_boundaries_specialized_functions.py → guidelines_boundary_functions_extra.py} +76 -42
- mergeron/core/pseudorandom_numbers.py +16 -22
- mergeron/data/__init__.py +3 -0
- mergeron/data/damodaran_margin_data.xls +0 -0
- mergeron/data/damodaran_margin_data_dict.msgpack +0 -0
- mergeron/demo/__init__.py +3 -0
- mergeron/demo/visualize_empirical_margin_distribution.py +86 -0
- mergeron/gen/__init__.py +257 -245
- mergeron/gen/data_generation.py +473 -221
- mergeron/gen/data_generation_functions.py +876 -0
- mergeron/gen/enforcement_stats.py +355 -0
- mergeron/gen/upp_tests.py +159 -259
- mergeron-2025.739265.0.dist-info/METADATA +115 -0
- mergeron-2025.739265.0.dist-info/RECORD +23 -0
- {mergeron-2024.738963.0.dist-info → mergeron-2025.739265.0.dist-info}/WHEEL +1 -1
- mergeron/License.txt +0 -16
- mergeron/core/InCommon RSA Server CA cert chain.pem +0 -68
- mergeron/core/excel_helper.py +0 -259
- mergeron/core/proportions_tests.py +0 -520
- mergeron/ext/__init__.py +0 -5
- mergeron/ext/tol_colors.py +0 -851
- mergeron/gen/_data_generation_functions_nonpublic.py +0 -621
- mergeron/gen/investigations_stats.py +0 -709
- mergeron/jinja_LaTex_templates/clrrate_cis_summary_table_template.tex.jinja2 +0 -121
- mergeron/jinja_LaTex_templates/ftcinvdata_byhhianddelta_table_template.tex.jinja2 +0 -82
- mergeron/jinja_LaTex_templates/ftcinvdata_summary_table_template.tex.jinja2 +0 -57
- mergeron/jinja_LaTex_templates/ftcinvdata_summarypaired_table_template.tex.jinja2 +0 -104
- mergeron/jinja_LaTex_templates/mergeron.cls +0 -161
- mergeron/jinja_LaTex_templates/mergeron_table_collection_template.tex.jinja2 +0 -90
- mergeron/jinja_LaTex_templates/setup_tikz_tables.tex.jinja2 +0 -84
- mergeron-2024.738963.0.dist-info/METADATA +0 -108
- mergeron-2024.738963.0.dist-info/RECORD +0 -30
- mergeron/{core → data}/ftc_invdata.msgpack +0 -0
mergeron/core/ftc_merger_investigations_data.py

@@ -4,30 +4,29 @@ as necessary
 
 
 NOTES
 -----
-
+Reported row and column totals from source data are not stored.
 
 """
 
 import shutil
 from collections.abc import Mapping, Sequence
-from importlib
+from importlib import resources
 from operator import itemgetter
 from pathlib import Path
 from types import MappingProxyType
-from typing import Any, NamedTuple
+from typing import Any, NamedTuple
 
 import msgpack  # type: ignore
 import msgpack_numpy as m  # type: ignore
 import numpy as np
 import re2 as re  # type: ignore
-import
+import urllib3
 from bs4 import BeautifulSoup
 from numpy.testing import assert_array_equal
-from numpy.typing import NDArray
 
-from .. import _PKG_NAME, DATA_DIR  # noqa: TID252
+from .. import _PKG_NAME, DATA_DIR, VERSION, ArrayBIGINT  # noqa: TID252
 
-__version__ =
+__version__ = VERSION
 
 m.patch()
 
@@ -36,11 +35,16 @@ if not FTCDATA_DIR.is_dir():
     FTCDATA_DIR.mkdir(parents=True)
 
 INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.msgpack"
-if
-
-
-
-
+if (
+    not INVDATA_ARCHIVE_PATH.is_file()
+    and (
+        _bundled_copy := resources.files(f"{_PKG_NAME}.data").joinpath(
+            INVDATA_ARCHIVE_PATH.name
+        )
+    ).is_file()
+):
+    with resources.as_file(_bundled_copy) as _bundled_copy_path:
+        shutil.copy2(_bundled_copy_path, INVDATA_ARCHIVE_PATH)
 
 TABLE_NO_RE = re.compile(r"Table \d+\.\d+")
 TABLE_TYPES = ("ByHHIandDelta", "ByFirmCount")
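The new guard seeds INVDATA_ARCHIVE_PATH from a copy of the archive bundled inside the package, matching the mergeron/{core → data}/ftc_invdata.msgpack move in the file list. A minimal sketch of the same importlib.resources technique, with hypothetical mypkg.data/payload.msgpack names:

import shutil
from importlib import resources
from pathlib import Path

target = Path.home() / ".cache" / "mypkg" / "payload.msgpack"  # hypothetical destination
target.parent.mkdir(parents=True, exist_ok=True)

# Traversable for the copy shipped with the installed package (may live inside a zip)
bundled = resources.files("mypkg.data").joinpath("payload.msgpack")
if not target.is_file() and bundled.is_file():
    # as_file() yields a real filesystem path, extracting from an archive if necessary
    with resources.as_file(bundled) as bundled_path:
        shutil.copy2(bundled_path, target)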
@@ -86,16 +90,17 @@ CNT_FCOUNT_DICT = {
 
 
 class INVTableData(NamedTuple):
-
-
-    data_array:
+    industry_group: str
+    additional_evidence: str
+    data_array: ArrayBIGINT
 
 
-INVData
+type INVData = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
+type _INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
 
 
 def construct_data(
-    _archive_path: Path
+    _archive_path: Path = INVDATA_ARCHIVE_PATH,
     *,
     flag_backward_compatibility: bool = True,
     flag_pharma_for_exclusion: bool = True,
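`type INVData = ...` and `type _INVData_in = ...` use the PEP 695 alias statement, which requires Python 3.12+; the pairing keeps a read-only Mapping for callers and a plain mutable dict for internal construction. A self-contained sketch of that pairing, with an illustrative two-field row standing in for the real INVTableData:

from collections.abc import Mapping
from types import MappingProxyType
from typing import NamedTuple

class Row(NamedTuple):  # stand-in for INVTableData
    industry_group: str
    additional_evidence: str

type Frozen = Mapping[str, Row]  # what callers receive
type Builder = dict[str, Row]    # what the builder mutates

def build() -> Frozen:
    acc: Builder = {}
    acc["Table 3.1"] = Row("All Markets", "Unrestricted on additional evidence")
    return MappingProxyType(acc)  # read-only view over the same dict

print(build()["Table 3.1"].industry_group)  # All Markets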
@@ -130,11 +135,11 @@ def construct_data(
     A dictionary of merger investigations data keyed to reporting periods
 
     """
-
+
     if _archive_path.is_file() and not rebuild_data:
         _archived_data = msgpack.unpackb(_archive_path.read_bytes(), use_list=False)
 
-        _invdata:
+        _invdata: _INVData_in = {}
         for _period in _archived_data:
             _invdata[_period] = {}
             for _table_type in _archived_data[_period]:
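`use_list=False` makes msgpack decode sequences as tuples, and the `m.patch()` call from the import hunk lets NumPy arrays pass through packb/unpackb unchanged. A minimal round-trip sketch (not the package's actual archive schema):

import msgpack
import msgpack_numpy as m
import numpy as np

m.patch()  # register numpy-aware encoders/decoders with msgpack

payload = {"1996-2003": {"counts": np.arange(6, dtype=np.int64).reshape(2, 3)}}
blob = msgpack.packb(payload)
restored = msgpack.unpackb(blob, use_list=False)  # sequences decode as tuples
assert np.array_equal(restored["1996-2003"]["counts"], payload["1996-2003"]["counts"])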
@@ -145,7 +150,7 @@ def construct_data(
             )
         return MappingProxyType(_invdata)
 
-    _invdata = dict(_parse_invdata())  # Convert immutable to mutable
+    _invdata = dict(_parse_invdata())  # type: ignore # Convert immutable to mutable
 
     # Add some data periods (
     # only periods ending in 2011, others have few observations and
@@ -161,7 +166,7 @@ def construct_data(
 
     # Create data for industries with no evidence on entry
     for _data_period in _invdata:
-
+        _construct_no_evidence_data(_invdata, _data_period)
 
     # Create a list of exclusions to named industries in the base period,
     # for construction of aggregate enforcement statistics where feasible
@@ -181,7 +186,9 @@ def construct_data(
     _aggr_tables_list = [
         _t
         for _t in _invdata["1996-2003"][_table_type]
-        if re.sub(
+        if re.sub(
+            r"\W", "", _invdata["1996-2003"][_table_type][_t].industry_group
+        )
         not in _industry_exclusion_list
     ]
 
@@ -191,42 +198,54 @@ def construct_data(
         )
     }
 
-    _ = INVDATA_ARCHIVE_PATH.write_bytes(msgpack.packb(_invdata))
+    _ = INVDATA_ARCHIVE_PATH.write_bytes(msgpack.packb(_invdata))  # pyright: ignore
 
     return MappingProxyType(_invdata)
 
 
-def
+def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) -> None:
     _invdata_ind_grp = "All Markets"
-
-
-    _invdata_sub_evid_cond_conc = _invdata[_data_period]["ByHHIandDelta"]
-    _invdata_sub_evid_cond_conc["Table 9.X"] = INVTableData(
-        _invdata_ind_grp,
-        _invdata_evid_cond,
-        np.column_stack((
-            _invdata_sub_evid_cond_conc["Table 3.1"].data_array[:, :2],
+    _table_nos_map = dict(
+        zip(
             (
-
-
-
+                "No Entry Evidence",
+                "No Evidence on Customer Complaints",
+                "No Evidence on Hot Documents",
             ),
-        )),
-    )
-
-    _invdata_sub_evid_cond_fcount = _invdata[_data_period]["ByFirmCount"]
-    _invdata_sub_evid_cond_fcount["Table 10.X"] = INVTableData(
-        _invdata_ind_grp,
-        _invdata_evid_cond,
-        np.column_stack((
-            _invdata_sub_evid_cond_fcount["Table 4.1"].data_array[:, :1],
             (
-
-
-
+                {"ByHHIandDelta": "Table 9.X", "ByFirmCount": "Table 10.X"},
+                {"ByHHIandDelta": "Table 7.X", "ByFirmCount": "Table 8.X"},
+                {"ByHHIandDelta": "Table 5.X", "ByFirmCount": "Table 6.X"},
             ),
-
+            strict=True,
+        )
     )
+    for _invdata_evid_cond in (
+        "No Entry Evidence",
+        "No Evidence on Customer Complaints",
+        "No Evidence on Hot Documents",
+    ):
+        for _stats_grp in ("ByHHIandDelta", "ByFirmCount"):
+            _invdata_sub_evid_cond_conc = _invdata[_data_period][_stats_grp]
+
+            _dtn = _table_nos_map[_invdata_evid_cond]["ByHHIandDelta"]
+            _stn0 = "Table 4.1" if _stats_grp == "ByFirmCount" else "Table 3.1"
+            _stn1, _stn2 = (_dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
+
+            _invdata_sub_evid_cond_conc |= {
+                _dtn: INVTableData(
+                    _invdata_ind_grp,
+                    _invdata_evid_cond,
+                    np.column_stack((
+                        _invdata_sub_evid_cond_conc[_stn0].data_array[:, :2],
+                        (
+                            _invdata_sub_evid_cond_conc[_stn0].data_array[:, 2:]
+                            - _invdata_sub_evid_cond_conc[_stn1].data_array[:, 2:]
+                            - _invdata_sub_evid_cond_conc[_stn2].data_array[:, 2:]
+                        ),
+                    )),
+                )
+            }
 
 
 def _construct_new_period_data(
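The rewritten _construct_no_evidence_data derives each synthetic ".X" table by carrying over the leading range columns of the unrestricted table and subtracting the two evidence-restricted tables from its count columns. A small worked sketch with hypothetical 4-column arrays (two range-id columns, then enforced/closed counts):

import numpy as np

unrestricted = np.array([[0, 0, 10, 4], [0, 1, 7, 5]], dtype=np.int64)  # all investigations
evidence_a = np.array([[0, 0, 3, 1], [0, 1, 2, 2]], dtype=np.int64)     # evidence condition 1
evidence_b = np.array([[0, 0, 4, 2], [0, 1, 1, 1]], dtype=np.int64)     # evidence condition 2

no_evidence = np.column_stack((
    unrestricted[:, :2],  # range-id columns carried over unchanged
    unrestricted[:, 2:] - evidence_a[:, 2:] - evidence_b[:, 2:],  # residual counts
))
print(no_evidence)  # rows: [0 0 3 1], [0 1 4 2]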
@@ -254,8 +273,8 @@ def _construct_new_period_data(
     for _table_no in _invdata_cuml[_table_type]:
         _invdata_cuml_sub_table = _invdata_cuml[_table_type][_table_no]
         _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array = (
-            _invdata_cuml_sub_table.
-            _invdata_cuml_sub_table.
+            _invdata_cuml_sub_table.industry_group,
+            _invdata_cuml_sub_table.additional_evidence,
             _invdata_cuml_sub_table.data_array,
         )
 
@@ -279,7 +298,7 @@ def _construct_new_period_data(
     # Consistency here means that the number of investigations reported
     # in each period is no less than the number reported in
     # any prior period.Although the time periods for table 3.2 through 3.5
-    # are not the
+    # are not the same in the data for 1996-2005 and 1996-2007 as in
     # the data for the other periods, they are nonetheless shorter than
     # the period 1996-2011, and hence the counts reported for 1996-2011
     # cannot be less than those reported in these prior periods. Note that
@@ -331,7 +350,8 @@ def _construct_new_period_data(
         _invdata_cuml_array[:, -3:-1] - _invdata_base_array[:, -3:-1]  # type: ignore
     )
 
-    #
+    # # // spellchecker: disable
+    # To examine the number of corrected values per table, // spellchecker: disable
     # uncomment the statements below
     # _invdata_array_bld_tbc = where(
     #     _invdata_array_bld_enfcls < 0, _invdata_array_bld_enfcls, 0
@@ -341,6 +361,7 @@ def _construct_new_period_data(
     #     f"{_data_period}, {_table_no}, {_invdata_ind_group}:",
     #     abs(np.einsum('ij->', invdata_array_bld_tbc))
     # )
+    # # // spellchecker: disable
 
     # Enforce non-negativity
     _invdata_array_bld_enfcls = np.stack((
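These hunks difference cumulative counts across reporting periods and then clamp negative residuals, treating them as source-data inconsistencies. A sketch of that correction in isolation, with made-up counts:

import numpy as np

cumulative = np.array([12, 9, 21], dtype=np.int64)  # counts through the later period
base = np.array([10, 11, 21], dtype=np.int64)       # counts through the earlier period

residual = cumulative - base  # second entry is -2: inconsistent source tables
# Diagnostic in the spirit of the commented-out block above: size of the correction
print(abs(np.einsum("i->", np.where(residual < 0, residual, 0))))  # 2
residual = np.where(residual < 0, 0, residual)  # enforce non-negativity
print(residual)  # [2 0 0]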
@@ -395,22 +416,23 @@ def _parse_invdata() -> INVData:
     by range of HHI and ∆HHI.
 
     """
+    raise ValueError(
+        "This function is defined here as documentation.\n"
+        "NOTE: License for `pymupdf`, upon which this function depends,"
+        " may be incompatible with the MIT license,"
+        " under which this pacakge is distributed."
+        " Making this fumction operable requires the user to modify"
+        " the source code as well as to install an additional package"
+        " not distributed with this package or included in its dependencies."
+    )
     import fitz  # type: ignore
-    # user must install pymupdf to make this function operable
 
-    _invdata_docnames
-        "040831horizmergersdata96-03.pdf",
-        "p035603horizmergerinvestigationdata1996-2005.pdf",
-        "081201hsrmergerdata.pdf",
-        "130104horizontalmergerreport.pdf",
-    )
+    _invdata_docnames = _download_invdata(FTCDATA_DIR)
 
     _invdata: dict[str, dict[str, dict[str, INVTableData]]] = {}
 
     for _invdata_docname in _invdata_docnames:
         _invdata_pdf_path = FTCDATA_DIR.joinpath(_invdata_docname)
-        if not _invdata_pdf_path.is_file():
-            _download_invdata(FTCDATA_DIR)
 
         _invdata_fitz = fitz.open(_invdata_pdf_path)
         _invdata_meta = _invdata_fitz.metadata
@@ -475,7 +497,7 @@ def _parse_invdata() -> INVData:
 
 
 def _parse_page_blocks(
-    _invdata:
+    _invdata: _INVData_in, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
 ) -> None:
     if _data_period != "1996-2011":
         _parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
@@ -502,7 +524,7 @@ def _parse_page_blocks(
 
 
 def _parse_table_blocks(
-    _invdata:
+    _invdata: _INVData_in, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
 ) -> None:
     _invdata_evid_cond = "Unrestricted on additional evidence"
     _table_num, _table_ser, _table_type = _identify_table_type(
@@ -577,12 +599,12 @@ def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str
 
 def _process_table_blks_conc_type(
     _table_blocks: Sequence[Sequence[str]], /
-) ->
+) -> ArrayBIGINT:
     _conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
 
     _col_titles_array = tuple(CONC_DELTA_DICT.values())
-    _col_totals:
-    _invdata_array:
+    _col_totals: ArrayBIGINT = np.zeros(len(_col_titles_array), np.int64)
+    _invdata_array: ArrayBIGINT = np.array(None)
 
     for _tbl_blk in _table_blocks:
         if _conc_row_pat.match(_blk_str := _tbl_blk[-3]):
@@ -610,7 +632,7 @@ def _process_table_blks_conc_type(
             _col_totals = _row_array
         else:
             _invdata_array = (
-                np.
+                np.vstack((_invdata_array, _row_array))
                 if _invdata_array.shape
                 else _row_array
             )
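Both block parsers grow their result with the same sentinel idiom: np.array(None) is a 0-d array whose empty .shape tuple is falsy, so the first parsed row replaces the sentinel and later rows are stacked beneath it. The idiom in isolation:

import numpy as np

acc = np.array(None)  # 0-d sentinel; acc.shape == () is falsy
for row in ([1, 2, 3], [4, 5, 6]):
    row_array = np.array(row, dtype=np.int64)
    acc = np.vstack((acc, row_array)) if acc.shape else row_array

print(acc.shape)  # (2, 3)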
@@ -634,13 +656,11 @@ def _process_table_blks_conc_type(
 
 def _process_table_blks_cnt_type(
     _table_blocks: Sequence[Sequence[str]], /
-) ->
+) -> ArrayBIGINT:
    _cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
 
-    _invdata_array:
-    _col_totals:
-        3, np.int64
-    )  # "enforced", "closed", "total"
+    _invdata_array: ArrayBIGINT = np.array(None)
+    _col_totals: ArrayBIGINT = np.zeros(3, np.int64)  # "enforced", "closed", "total"
 
     for _tbl_blk in _table_blocks:
         if _cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
@@ -657,7 +677,7 @@ def _process_table_blks_cnt_type(
             _col_totals = _row_list
         else:
             _invdata_array = (
-                np.
+                np.vstack((_invdata_array, _row_list))
                 if _invdata_array.shape
                 else _row_list
             )
@@ -673,32 +693,61 @@ def _process_table_blks_cnt_type(
     return _invdata_array[np.argsort(_invdata_array[:, 0])]
 
 
-def _download_invdata(_dl_path: Path) ->
+def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
+    if not _dl_path.is_dir():
+        _dl_path.mkdir(parents=True)
+
     _invdata_homepage_urls = (
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
     )
-    _invdata_docnames =
+    _invdata_docnames = (
+        "040831horizmergersdata96-03.pdf",
+        "p035603horizmergerinvestigationdata1996-2005.pdf",
+        "081201hsrmergerdata.pdf",
+        "130104horizontalmergerreport.pdf",
+    )
+
+    if all(
+        _dl_path.joinpath(_invdata_docname).is_file()
+        for _invdata_docname in _invdata_docnames
+    ):
+        return _invdata_docnames
+
+    _invdata_docnames_dl: tuple[str, ...] = ()
+    _u3pm = urllib3.PoolManager()
+    _chunk_size = 1024 * 1024
     for _invdata_homepage_url in _invdata_homepage_urls:
-
-
-
-
-
-
-
-
-
+        with _u3pm.request(
+            "GET", _invdata_homepage_url, preload_content=False
+        ) as _u3handle:
+            _invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
+            _invdata_attrs = [
+                (_g.get("title", ""), _g.get("href", ""))
+                for _g in _invdata_soup.find_all("a")
+                if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
+            ]
         for _invdata_attr in _invdata_attrs:
-
-
-            with
-
-
-
-
-
+            _invdata_docname, _invdata_link = _invdata_attr
+            _invdata_docnames_dl += (_invdata_docname,)
+            with (
+                _u3pm.request(
+                    "GET", f"https://www.ftc.gov/{_invdata_link}", preload_content=False
+                ) as _urlopen_handle,
+                _dl_path.joinpath(_invdata_docname).open("wb") as _invdata_fh,
+            ):
+                while True:
+                    _data = _urlopen_handle.read(_chunk_size)
+                    if not _data:
+                        break
+                    _invdata_fh.write(_data)
+
+    return _invdata_docnames_dl
 
-
+
+if __name__ == "__main__":
+    print(
+        "This module defines functions for downloading and preparing FTC merger investigations data for further analysis."
+    )