mergeron 2025.739290.3-py3-none-any.whl → 2025.739290.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -10,25 +10,37 @@ Reported row and column totals from source data are not stored.
 
 from __future__ import annotations
 
+import re
 import shutil
-from collections.abc import Mapping, Sequence
-from dataclasses import dataclass
+from collections.abc import Sequence
 from importlib import resources
 from operator import itemgetter
 from pathlib import Path
 from types import MappingProxyType
 from typing import Any
+from zipfile import ZIP_DEFLATED, ZipFile
 
-import msgpack  # type: ignore
 import msgpack_numpy as m  # type: ignore
 import numpy as np
-import re2 as re  # type: ignore
 import urllib3
 from bs4 import BeautifulSoup
 from numpy.testing import assert_array_equal
-from ruamel import yaml
 
-from .. import _PKG_NAME, DATA_DIR, EMPTY_ARRAYINT, VERSION, ArrayBIGINT  # noqa: TID252
+from .. import (  # noqa: TID252
+    _PKG_NAME,
+    DATA_DIR,
+    EMPTY_ARRAYINT,
+    VERSION,
+    ArrayBIGINT,
+    this_yaml,
+)
+from . import (
+    INVData,
+    INVData_in_,
+    INVTableData,
+    _dict_from_mapping,
+    _mappingproxy_from_mapping,
+)
 
 __version__ = VERSION
 
@@ -38,7 +50,7 @@ FTCDATA_DIR = DATA_DIR / "FTCData"
 if not FTCDATA_DIR.is_dir():
     FTCDATA_DIR.mkdir(parents=True)
 
-INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.msgpack"
+INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.zip"
 if (
     not INVDATA_ARCHIVE_PATH.is_file()
     and (
@@ -93,32 +105,6 @@ CNT_FCOUNT_DICT = {
 }
 
 
-@dataclass(slots=True, frozen=True)
-class INVTableData:
-    industry_group: str
-    additional_evidence: str
-    data_array: ArrayBIGINT
-
-    @classmethod
-    def to_yaml(
-        cls, _r: yaml.representer.SafeRepresenter, _d: INVTableData
-    ) -> yaml.MappingNode:
-        _ret: yaml.MappingNode = _r.represent_mapping(
-            f"!{cls.__name__}", {_a: getattr(_d, _a) for _a in _d.__dataclass_fields__}
-        )
-        return _ret
-
-    @classmethod
-    def from_yaml(
-        cls, _c: yaml.constructor.SafeConstructor, _n: yaml.MappingNode
-    ) -> INVTableData:
-        return cls(**_c.construct_mapping(_n))
-
-
-type INVData = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
-type _INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
-
-
 def construct_data(
     _archive_path: Path = INVDATA_ARCHIVE_PATH,
     *,
@@ -157,75 +143,83 @@ def construct_data(
     """
 
     if _archive_path.is_file() and not rebuild_data:
-        _archived_data = msgpack.unpackb(_archive_path.read_bytes(), use_list=False)
-
-        _invdata: _INVData_in = {}
-        for _period in _archived_data:
-            _invdata[_period] = {}
-            for _table_type in _archived_data[_period]:
-                _invdata[_period][_table_type] = {}
-                for _table_no in _archived_data[_period][_table_type]:
-                    _invdata[_period][_table_type][_table_no] = INVTableData(
-                        *_archived_data[_period][_table_type][_table_no]
-                    )
-        return MappingProxyType(_invdata)
+        with (
+            ZipFile(_archive_path, "r") as _yzh,
+            _yzh.open(f"{_archive_path.stem}.yaml", "r") as _yfh,
+        ):
+            invdata_ = this_yaml.load(_yfh)
+        if isinstance(invdata_, MappingProxyType):
+            invdata_ = _mappingproxy_from_mapping(invdata_)
+            with (
+                ZipFile(_archive_path, "w", compression=ZIP_DEFLATED) as _yzh,
+                _yzh.open(f"{_archive_path.stem}.yaml", "w") as _yfh,
+            ):
+                this_yaml.dump(invdata_, _yfh)
+        return invdata_
 
-    _invdata = dict(_parse_invdata())  # type: ignore # Convert immutable to mutable
+    invdata: INVData_in_ = _dict_from_mapping(_parse_invdata())
 
     # Add some data periods (
    # only periods ending in 2011, others have few observations and
     # some incompatibilities
     # )
-    for _data_period in "2004-2011", "2006-2011", "2008-2011":
-        _invdata_bld = _construct_new_period_data(
-            _invdata,
-            _data_period,
+    for data_period in "2004-2011", "2006-2011", "2008-2011":
+        invdata_bld = _construct_new_period_data(
+            invdata,
+            data_period,
             flag_backward_compatibility=flag_backward_compatibility,
         )
-        _invdata |= {_data_period: _invdata_bld}
+        invdata |= {data_period: invdata_bld}
 
     # Create data for industries with no evidence on entry
-    for _data_period in _invdata:
-        _construct_no_evidence_data(_invdata, _data_period)
+    for data_period in invdata:
+        _construct_no_evidence_data(invdata, data_period)
 
     # Create a list of exclusions to named industries in the base period,
     # for construction of aggregate enforcement statistics where feasible
-    _industry_exclusion_list = (
+    industry_exclusion_list = {
         "AllMarkets",
         "OtherMarkets",
         "IndustriesinCommon",
         "",
         ("PharmaceuticalsMarkets" if flag_pharma_for_exclusion else None),
-    )
-    for _data_period in "1996-2003", "1996-2011", "2004-2011":
-        for _table_type, _table_no in zip(
+    }
+
+    # Construct aggregate tables
+    for data_period in "1996-2003", "1996-2011", "2004-2011":
+        for table_type, table_no in zip(
             TABLE_TYPES, (CONC_TABLE_ALL, CNT_TABLE_ALL), strict=True
         ):
-            _invdata_sub_tabletype = _invdata[_data_period][_table_type]
+            invdata_sub_tabletype = invdata[data_period][table_type]
 
-            _aggr_tables_list = [
-                _t
-                for _t in _invdata["1996-2003"][_table_type]
+            aggr_tables_list = [
+                t_
+                for t_ in invdata["1996-2003"][table_type]
                 if re.sub(
-                    r"\W", "", _invdata["1996-2003"][_table_type][_t].industry_group
+                    r"\W", "", invdata["1996-2003"][table_type][t_].industry_group
                 )
-                not in _industry_exclusion_list
+                not in industry_exclusion_list
             ]
 
-            _invdata_sub_tabletype |= {
-                _table_no.replace(".1", ".X"): _invdata_build_aggregate_table(
-                    _invdata_sub_tabletype, _aggr_tables_list
+            invdata_sub_tabletype |= {
+                table_no.replace(".1", ".X"): invdata_build_aggregate_table(
+                    invdata_sub_tabletype, aggr_tables_list
                 )
             }
 
-    _ = INVDATA_ARCHIVE_PATH.write_bytes(msgpack.packb(_invdata))
+    retval: INVData = _mappingproxy_from_mapping(invdata)
+    with (
+        ZipFile(_archive_path, "w", compression=ZIP_DEFLATED) as _yzh,
+        _yzh.open(f"{_archive_path.stem}.yaml", "w") as _yfh,
+    ):
+        this_yaml.dump(retval, _yfh)
 
-    return MappingProxyType(_invdata)
+    return retval
 
 
-def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) -> None:
-    _invdata_ind_grp = "All Markets"
-    _table_nos_map = dict(
+def _construct_no_evidence_data(_invdata: INVData_in_, _data_period: str, /) -> None:
+    invdata_ind_grp = "All Markets"
+    table_nos_map = dict(
         zip(
             (
                 "No Entry Evidence",
@@ -240,28 +234,28 @@ def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) ->
             strict=True,
         )
     )
-    for _invdata_evid_cond in (
+    for invdata_evid_cond in (
         "No Entry Evidence",
         "No Evidence on Customer Complaints",
         "No Evidence on Hot Documents",
     ):
-        for _stats_grp in ("ByHHIandDelta", "ByFirmCount"):
-            _invdata_sub_evid_cond_conc = _invdata[_data_period][_stats_grp]
+        for stats_grp in ("ByHHIandDelta", "ByFirmCount"):
+            invdata_sub_evid_cond_conc = _invdata[_data_period][stats_grp]
 
-            _dtn = _table_nos_map[_invdata_evid_cond]["ByHHIandDelta"]
-            _stn0 = "Table 4.1" if _stats_grp == "ByFirmCount" else "Table 3.1"
-            _stn1, _stn2 = (_dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
+            dtn = table_nos_map[invdata_evid_cond]["ByHHIandDelta"]
+            stn0 = "Table 4.1" if stats_grp == "ByFirmCount" else "Table 3.1"
+            stn1, stn2 = (dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
 
-            _invdata_sub_evid_cond_conc |= {
-                _dtn: INVTableData(
-                    _invdata_ind_grp,
-                    _invdata_evid_cond,
+            invdata_sub_evid_cond_conc |= {
+                dtn: INVTableData(
+                    invdata_ind_grp,
+                    invdata_evid_cond,
                     np.column_stack((
-                        _invdata_sub_evid_cond_conc[_stn0].data_array[:, :2],
+                        invdata_sub_evid_cond_conc[stn0].data_array[:, :2],
                         (
-                            _invdata_sub_evid_cond_conc[_stn0].data_array[:, 2:]
-                            - _invdata_sub_evid_cond_conc[_stn1].data_array[:, 2:]
-                            - _invdata_sub_evid_cond_conc[_stn2].data_array[:, 2:]
+                            invdata_sub_evid_cond_conc[stn0].data_array[:, 2:]
+                            - invdata_sub_evid_cond_conc[stn1].data_array[:, 2:]
+                            - invdata_sub_evid_cond_conc[stn2].data_array[:, 2:]
                         ),
                     )),
                 )
@@ -275,44 +269,44 @@ def _construct_new_period_data(
     *,
     flag_backward_compatibility: bool = False,
 ) -> dict[str, dict[str, INVTableData]]:
-    _cuml_period = "1996-{}".format(int(_data_period.split("-")[1]))
-    if _cuml_period != "1996-2011":
+    cuml_period = f"1996-{_data_period.split('-')[1]}"
+    if cuml_period != "1996-2011":
         raise ValueError('Expected cumulative period, "1996-2011"')
 
-    _invdata_cuml = _invdata[_cuml_period]
+    invdata_cuml = _invdata[cuml_period]
 
-    _base_period = "1996-{}".format(int(_data_period.split("-")[0]) - 1)
-    _invdata_base = _invdata[_base_period]
+    base_period = "1996-{}".format(int(_data_period.split("-")[0]) - 1)
+    invdata_base = _invdata[base_period]
 
-    if tuple(_invdata_cuml.keys()) != TABLE_TYPES:
+    if tuple(invdata_cuml.keys()) != TABLE_TYPES:
         raise ValueError("Source data does not include the expected groups of tables.")
 
-    _invdata_bld = {}
-    for _table_type in TABLE_TYPES:
-        _data_typesubdict = {}
-        for _table_no in _invdata_cuml[_table_type]:
-            _invdata_cuml_sub_table = _invdata_cuml[_table_type][_table_no]
-            _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array = (
-                _invdata_cuml_sub_table.industry_group,
-                _invdata_cuml_sub_table.additional_evidence,
-                _invdata_cuml_sub_table.data_array,
+    invdata_bld = {}
+    for table_type in TABLE_TYPES:
+        data_typesubdict = {}
+        for table_no in invdata_cuml[table_type]:
+            invdata_cuml_sub_table = invdata_cuml[table_type][table_no]
+            invdata_ind_group, invdata_evid_cond, invdata_cuml_array = (
+                invdata_cuml_sub_table.industry_group,
+                invdata_cuml_sub_table.additional_evidence,
+                invdata_cuml_sub_table.data_array,
             )
 
-            _invdata_base_sub_table = _invdata_base[_table_type].get(
-                _table_no, INVTableData("", "", EMPTY_ARRAYINT)
+            invdata_base_sub_table = invdata_base[table_type].get(
+                table_no, INVTableData("", "", EMPTY_ARRAYINT)
             )
 
-            (_invdata_base_ind_group, _invdata_base_evid_cond, _invdata_base_array) = (
-                getattr(_invdata_base_sub_table, _a)
+            (invdata_base_ind_group, invdata_base_evid_cond, invdata_base_array) = (
+                getattr(invdata_base_sub_table, _a)
                 for _a in ("industry_group", "additional_evidence", "data_array")
             )
 
             # Some tables can't be constructed due to inconsistencies in the data
             # across time periods
             if (
-                (_data_period != "2004-2011" and _invdata_ind_group != "All Markets")
-                or (_invdata_ind_group in ('"Other" Markets', "Industries in Common"))
-                or (_invdata_base_ind_group in ('"Other" Markets', ""))
+                (_data_period != "2004-2011" and invdata_ind_group != "All Markets")
+                or (invdata_ind_group in {'"Other" Markets', "Industries in Common"})
+                or (invdata_base_ind_group in {'"Other" Markets', ""})
             ):
                 continue
 
@@ -328,31 +322,29 @@
                 # The number of "revisions" applied below, for enforcing consistency,
                 # is sufficiently small as to be unlikely to substantially impact
                 # results from analysis of the data.
-                _invdata_cuml_array_stack = []
-                _invdata_base_array_stack = []
-
-                for _data_period_detail in _invdata:
-                    _pd_start, _pd_end = (
-                        int(g) for g in _data_period_detail.split("-")
-                    )
-                    if _pd_start == 1996:
-                        _invdata_cuml_array_stack += [
-                            _invdata[_data_period_detail][_table_type][
-                                _table_no
+                invdata_cuml_array_stack = []
+                invdata_base_array_stack = []
+
+                for data_period_detail in _invdata:
+                    pd_start, pd_end = (int(g) for g in data_period_detail.split("-"))
+                    if pd_start == 1996:
+                        invdata_cuml_array_stack += [
+                            _invdata[data_period_detail][table_type][
+                                table_no
                             ].data_array[:, -3:-1]
                         ]
-                    if _pd_start == 1996 and _pd_end < int(_data_period.split("-")[0]):
-                        _invdata_base_array_stack += [
-                            _invdata[_data_period_detail][_table_type][
-                                _table_no
+                    if pd_start == 1996 and pd_end < int(_data_period.split("-")[0]):
+                        invdata_base_array_stack += [
+                            _invdata[data_period_detail][table_type][
+                                table_no
                             ].data_array[:, -3:-1]
                         ]
-                _invdata_cuml_array_enfcls, _invdata_base_array_enfcls = (
+                invdata_cuml_array_enfcls, invdata_base_array_enfcls = (
                     np.stack(_f).max(axis=0)
-                    for _f in (_invdata_cuml_array_stack, _invdata_base_array_stack)
+                    for _f in (invdata_cuml_array_stack, invdata_base_array_stack)
                 )
-                _invdata_array_bld_enfcls = (
-                    _invdata_cuml_array_enfcls - _invdata_base_array_enfcls
+                invdata_array_bld_enfcls = (
+                    invdata_cuml_array_enfcls - invdata_base_array_enfcls
                 )
             else:
                 # Consistency here means that the most recent data are considered
@@ -369,59 +361,59 @@
                 # backward compatible due to minor variation in (applying) the criteria
                 # for inclusion, as well as industry coding, undertaken to maintain
                 # transparency on the enforcement process.
-                _invdata_array_bld_enfcls = (
-                    _invdata_cuml_array[:, -3:-1] - _invdata_base_array[:, -3:-1]  # type: ignore
+                invdata_array_bld_enfcls = (
+                    invdata_cuml_array[:, -3:-1] - invdata_base_array[:, -3:-1]
                 )
 
             # # // spellchecker: disable
             # To examine the number of corrected values per table, // spellchecker: disable
             # uncomment the statements below
-            # _invdata_array_bld_tbc = where(
-            #     _invdata_array_bld_enfcls < 0, _invdata_array_bld_enfcls, 0
+            # invdata_array_bld_tbc = where(
+            #     invdata_array_bld_enfcls < 0, invdata_array_bld_enfcls, 0
             # )
             # if np.einsum('ij->', invdata_array_bld_tbc):
             #     print(
-            #         f"{_data_period}, {_table_no}, {_invdata_ind_group}:",
+            #         f"{_data_period}, {_table_no}, {invdata_ind_group}:",
             #         abs(np.einsum('ij->', invdata_array_bld_tbc))
             #     )
             # # // spellchecker: disable
 
             # Enforce non-negativity
-            _invdata_array_bld_enfcls = np.stack((
-                _invdata_array_bld_enfcls,
-                np.zeros_like(_invdata_array_bld_enfcls),
+            invdata_array_bld_enfcls = np.stack((
+                invdata_array_bld_enfcls,
+                np.zeros_like(invdata_array_bld_enfcls),
             )).max(axis=0)
 
-            _invdata_array_bld = np.column_stack((
-                _invdata_cuml_array[:, :-3],
-                _invdata_array_bld_enfcls,
-                np.einsum("ij->i", _invdata_array_bld_enfcls),
+            invdata_array_bld = np.column_stack((
+                invdata_cuml_array[:, :-3],
+                invdata_array_bld_enfcls,
+                np.einsum("ij->i", invdata_array_bld_enfcls),
             ))
 
-            _data_typesubdict[_table_no] = INVTableData(
-                _invdata_ind_group, _invdata_evid_cond, _invdata_array_bld
+            data_typesubdict[table_no] = INVTableData(
+                invdata_ind_group, invdata_evid_cond, invdata_array_bld
             )
-            del _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array
-            del _invdata_base_ind_group, _invdata_base_evid_cond, _invdata_base_array
-            del _invdata_array_bld
-        _invdata_bld[_table_type] = _data_typesubdict
-    return _invdata_bld
+            del invdata_ind_group, invdata_evid_cond, invdata_cuml_array
+            del invdata_base_ind_group, invdata_base_evid_cond, invdata_base_array
+            del invdata_array_bld
+        invdata_bld[table_type] = data_typesubdict
+    return invdata_bld
 
 
-def _invdata_build_aggregate_table(
+def invdata_build_aggregate_table(
     _data_typesub: dict[str, INVTableData], _aggr_table_list: Sequence[str]
 ) -> INVTableData:
-    _hdr_table_no = _aggr_table_list[0]
+    hdr_table_no = _aggr_table_list[0]
 
     return INVTableData(
         "Industries in Common",
         "Unrestricted on additional evidence",
         np.column_stack((
-            _data_typesub[_hdr_table_no].data_array[:, :-3],
+            _data_typesub[hdr_table_no].data_array[:, :-3],
             np.einsum(
                 "ijk->jk",
                 np.stack([
-                    (_data_typesub[_t]).data_array[:, -3:] for _t in _aggr_table_list
+                    (_data_typesub[t_]).data_array[:, -3:] for t_ in _aggr_table_list
                 ]),
             ),
         )),
@@ -439,294 +431,295 @@ def _parse_invdata() -> INVData:
     by range of HHI and ∆HHI.
 
     """
-    raise ValueError(
-        "This function is defined here as documentation.\n"
-        "NOTE: License for `pymupdf`, upon which this function depends,"
-        " may be incompatible with the MIT license,"
-        " under which this pacakge is distributed."
-        " Making this fumction operable requires the user to modify"
-        " the source code as well as to install an additional package"
-        " not distributed with this package or included in its dependencies."
-    )
-    import fitz  # type: ignore
-
-    _invdata_docnames = _download_invdata(FTCDATA_DIR)
-
-    _invdata: dict[str, dict[str, dict[str, INVTableData]]] = {}
-
-    for _invdata_docname in _invdata_docnames:
-        _invdata_pdf_path = FTCDATA_DIR.joinpath(_invdata_docname)
-
-        _invdata_fitz = fitz.open(_invdata_pdf_path)
-        _invdata_meta = _invdata_fitz.metadata
-        if _invdata_meta["title"] == " ":
-            _invdata_meta["title"] = ", ".join((
+    # raise ValueError(
+    #     "This function is defined here as documentation.\n"
+    #     "NOTE: License for `pymupdf`, upon which this function depends,"
+    #     " may be incompatible with the MIT license,"
+    #     " under which this pacakge is distributed."
+    #     " Making this fumction operable requires the user to modify"
+    #     " the source code as well as to install an additional package"
+    #     " not distributed with this package or identified as a requirement."
+    # )
+    import pymupdf  # type: ignore  # noqa: PLC0415
+
+    invdata_docnames = _download_invdata(FTCDATA_DIR)
+
+    invdata: INVData_in_ = {}
+
+    for invdata_docname in invdata_docnames:
+        invdata_pdf_path = FTCDATA_DIR.joinpath(invdata_docname)
+
+        invdata_doc = pymupdf.open(invdata_pdf_path)
+        invdata_meta = invdata_doc.metadata
+        if invdata_meta["title"] == " ":
+            invdata_meta["title"] = ", ".join((
                 "Horizontal Merger Investigation Data",
                 "Fiscal Years",
                 "1996-2005",
             ))
 
-        _data_period = re.findall(r"(\d{4}) *(-) *(\d{4})", _invdata_meta["title"])[0]
-        _data_period = "".join(_data_period)
+        data_period = "".join(  # line-break here for readability
+            re.findall(r"(\d{4}) *(-) *(\d{4})", invdata_meta["title"])[0]
+        )
 
         # Initialize containers for parsed data
-        _invdata[_data_period] = {k: {} for k in TABLE_TYPES}
+        invdata[data_period] = {k: {} for k in TABLE_TYPES}
 
-        for _pdf_pg in _invdata_fitz.pages():
-            _doc_pg_blocks = _pdf_pg.get_text("blocks", sort=False)
+        for pdf_pg in invdata_doc.pages():
+            doc_pg_blocks = pdf_pg.get_text("blocks", sort=False)
             # Across all published reports of FTC investigations data,
             # sorting lines (PDF page blocks) by the lower coordinates
             # and then the left coordinates is most effective for
             # ordering table rows in top-to-bottom order; this doesn't
             # work for the 1996-2005 data, however, so we resort later
-            _doc_pg_blocks = sorted([
+            doc_pg_blocks = sorted([
                 (f"{_f[3]:03.0f}{_f[0]:03.0f}{_f[1]:03.0f}{_f[2]:03.0f}", *_f)
-                for _f in _doc_pg_blocks
+                for _f in doc_pg_blocks
                 if _f[-1] == 0
             ])
 
-            _data_blocks: list[tuple[str]] = [("",)]
+            data_blocks: list[tuple[str]] = [("",)]
             # Pages layouts not the same in all reports
-            _pg_hdr_strings = (
+            pg_hdr_strings = (
                 "FEDERAL TRADE COMMISSION",
                 "HORIZONTAL MERGER INVESTIGATION DATA: FISCAL YEARS 1996 - 2011",
             )
-            if len(_doc_pg_blocks) > 4:
-                _tnum: re.match = None
-                for _blk_idx, _pg_blk in enumerate(_doc_pg_blocks):
-                    if _tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
-                        _data_blocks = [
-                            _b
-                            for _b in _doc_pg_blocks
-                            if not _b[-3].startswith(_pg_hdr_strings)
+            if len(doc_pg_blocks) > 4:
+                tnum = None
+                for _pg_blk in doc_pg_blocks:
+                    if tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
+                        data_blocks = [
+                            b_
+                            for b_ in doc_pg_blocks
+                            if not b_[-3].startswith(pg_hdr_strings)
                             and (
-                                _b[-3].strip()
-                                not in ("Significant Competitors", "Post Merger HHI")
+                                b_[-3].strip()
+                                not in {"Significant Competitors", "Post Merger HHI"}
                             )
-                            and not re.fullmatch(r"\d+", _b[-3].strip())
+                            and not re.fullmatch(r"\d+", b_[-3].strip())
                         ]
                         break
-                if not _tnum:
+                if not tnum:
                     continue
-                del _tnum
+                del tnum
             else:
                 continue
 
-            _parse_page_blocks(_invdata, _data_period, _data_blocks)
+            _parse_page_blocks(invdata, data_period, data_blocks)
 
-        _invdata_fitz.close()
+        invdata_doc.close()
 
-    return MappingProxyType(_invdata)
+    return _mappingproxy_from_mapping(invdata)
 
 
 def _parse_page_blocks(
-    _invdata: _INVData_in, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
+    _invdata: INVData_in_, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
 ) -> None:
     if _data_period != "1996-2011":
         _parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
     else:
-        _test_list = [
+        test_list = [
             (g, f[-3].strip())
             for g, f in enumerate(_doc_pg_blocks)
             if TABLE_NO_RE.fullmatch(f[-3].strip())
         ]
         # In the 1996-2011 report, there are 2 tables per page
-        if len(_test_list) == 1:
-            _table_a_blocks = _doc_pg_blocks
-            _table_b_blocks: Sequence[Sequence[Any]] = []
+        if len(test_list) == 1:
+            table_a_blocks = _doc_pg_blocks
+            table_b_blocks: Sequence[Sequence[Any]] = []
         else:
-            _table_a_blocks, _table_b_blocks = (
-                _doc_pg_blocks[_test_list[0][0] : _test_list[1][0]],
-                _doc_pg_blocks[_test_list[1][0] :],
+            table_a_blocks, table_b_blocks = (
+                _doc_pg_blocks[test_list[0][0] : test_list[1][0]],
+                _doc_pg_blocks[test_list[1][0] :],
             )
 
-        for _table_i_blocks in _table_a_blocks, _table_b_blocks:
-            if not _table_i_blocks:
+        for table_i_blocks in table_a_blocks, table_b_blocks:
+            if not table_i_blocks:
                 continue
-            _parse_table_blocks(_invdata, _data_period, _table_i_blocks)
+            _parse_table_blocks(_invdata, _data_period, table_i_blocks)
 
 
 def _parse_table_blocks(
-    _invdata: _INVData_in, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
+    _invdata: INVData_in_, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
 ) -> None:
-    _invdata_evid_cond = "Unrestricted on additional evidence"
-    _table_num, _table_ser, _table_type = _identify_table_type(
+    invdata_evid_cond = "Unrestricted on additional evidence"
+    table_num, table_ser, table_type = _identify_table_type(
         _table_blocks[0][-3].strip()
     )
 
     if _data_period == "1996-2011":
-        _invdata_ind_group = (
+        invdata_ind_group = (
             _table_blocks[1][-3].split("\n")[1]
-            if _table_num == "Table 4.8"
+            if table_num == "Table 4.8"
             else _table_blocks[2][-3].split("\n")[0]
         )
 
-        if _table_ser > 4:
-            _invdata_evid_cond = (
+        if table_ser > 4:
+            invdata_evid_cond = (
                 _table_blocks[2][-3].split("\n")[1]
-                if _table_ser in (9, 10)
+                if table_ser in {9, 10}
                 else _table_blocks[3][-3].strip()
             )
 
     elif _data_period == "1996-2005":
         _table_blocks = sorted(_table_blocks, key=itemgetter(6))
 
-        _invdata_ind_group = _table_blocks[3][-3].strip()
-        if _table_ser > 4:
-            _invdata_evid_cond = _table_blocks[5][-3].strip()
+        invdata_ind_group = _table_blocks[3][-3].strip()
+        if table_ser > 4:
+            invdata_evid_cond = _table_blocks[5][-3].strip()
 
-    elif _table_ser % 2 == 0:
-        _invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
-        if (_evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
-            _invdata_evid_cond = "Unrestricted on additional evidence"
+    elif table_ser % 2 == 0:
+        invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
+        if (evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
+            invdata_evid_cond = "Unrestricted on additional evidence"
         else:
-            _invdata_evid_cond = _evid_cond_teststr
+            invdata_evid_cond = evid_cond_teststr
 
     elif _table_blocks[3][-3].startswith("FTC Horizontal Merger Investigations"):
-        _invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
-        _invdata_evid_cond = "Unrestricted on additional evidence"
+        invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
+        invdata_evid_cond = "Unrestricted on additional evidence"
 
     else:
         # print(_table_blocks)
-        _invdata_evid_cond = (
+        invdata_evid_cond = (
             _table_blocks[1][-3].strip()
-            if _table_ser == 9
+            if table_ser == 9
             else _table_blocks[3][-3].strip()
         )
-        _invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
+        invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
 
-    if _invdata_ind_group == "Pharmaceutical Markets":
-        _invdata_ind_group = "Pharmaceuticals Markets"
+    if invdata_ind_group == "Pharmaceutical Markets":
+        invdata_ind_group = "Pharmaceuticals Markets"
 
     process_table_func = (
         _process_table_blks_conc_type
-        if _table_type == TABLE_TYPES[0]
+        if table_type == TABLE_TYPES[0]
         else _process_table_blks_cnt_type
     )
 
-    _table_array = process_table_func(_table_blocks)
-    if not isinstance(_table_array, np.ndarray) or _table_array.dtype != np.int64:
-        print(_table_num)
+    table_array = process_table_func(_table_blocks)
+    if not isinstance(table_array, np.ndarray) or table_array.dtype != np.uint64:
+        print(table_num)
         print(_table_blocks)
         raise ValueError
 
-    _table_data = INVTableData(_invdata_ind_group, _invdata_evid_cond, _table_array)
-    _invdata[_data_period][_table_type] |= {_table_num: _table_data}
+    table_data = INVTableData(invdata_ind_group, invdata_evid_cond, table_array)
+    _invdata[_data_period][table_type] |= {table_num: table_data}
 
 
 def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str]:
-    _tnum = _tnstr.split(" ")[1]
-    _tsub = int(_tnum.split(".")[0])
-    return _tnstr, _tsub, TABLE_TYPES[(_tsub + 1) % 2]
+    tnum = _tnstr.split(" ")[1]
+    tsub = int(tnum.split(".")[0])
+    return _tnstr, tsub, TABLE_TYPES[(tsub + 1) % 2]
 
 
 def _process_table_blks_conc_type(
     _table_blocks: Sequence[Sequence[str]], /
 ) -> ArrayBIGINT:
-    _conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
-
-    _col_titles_array = tuple(CONC_DELTA_DICT.values())
-    _col_totals: ArrayBIGINT = np.zeros(len(_col_titles_array), np.int64)
-    _invdata_array: ArrayBIGINT = np.array(None)
-
-    for _tbl_blk in _table_blocks:
-        if _conc_row_pat.match(_blk_str := _tbl_blk[-3]):
-            _row_list: list[str] = _blk_str.strip().split("\n")
-            _row_title: str = _row_list.pop(0)
-            _row_key: int = CONC_HHI_DICT[_row_title]
-            _row_total = np.array(_row_list.pop().replace(",", "").split("/"), np.int64)
-            _row_array_list: list[list[int]] = []
-            while _row_list:
-                _enfd_val, _clsd_val = _row_list.pop(0).split("/")
-                _row_array_list += [
+    conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
+
+    col_titles_array = tuple(CONC_DELTA_DICT.values())
+    col_totals: ArrayBIGINT = np.zeros(len(col_titles_array), np.uint64)
+    invdata_array: ArrayBIGINT = np.array(None)
+
+    for tbl_blk in _table_blocks:
+        if conc_row_pat.match(_blk_str := tbl_blk[-3]):
+            row_list: list[str] = _blk_str.strip().split("\n")
+            row_title: str = row_list.pop(0)
+            row_key: int = CONC_HHI_DICT[row_title]
+            row_total = np.array(row_list.pop().replace(",", "").split("/"), np.uint64)
+            row_array_list: list[list[int]] = []
+            while row_list:
+                enfd_val, clsd_val = row_list.pop(0).split("/")
+                row_array_list += [
                     [
-                        _row_key,
-                        _col_titles_array[len(_row_array_list)],
-                        int(_enfd_val),
-                        int(_clsd_val),
-                        int(_enfd_val) + int(_clsd_val),
+                        row_key,
+                        col_titles_array[len(row_array_list)],
+                        int(enfd_val),
+                        int(clsd_val),
+                        int(enfd_val) + int(clsd_val),
                     ]
                 ]
-            _row_array = np.array(_row_array_list, np.int64)
+            row_array = np.array(row_array_list, np.uint64)
             # Check row totals
-            assert_array_equal(_row_total, np.einsum("ij->j", _row_array[:, 2:4]))
+            assert_array_equal(row_total, np.einsum("ij->j", row_array[:, 2:4]))
 
-            if _row_key == TTL_KEY:
-                _col_totals = _row_array
+            if row_key == TTL_KEY:
+                col_totals = row_array
             else:
-                _invdata_array = (
-                    np.vstack((_invdata_array, _row_array))
-                    if _invdata_array.shape
-                    else _row_array
+                invdata_array = (
+                    np.vstack((invdata_array, row_array))
+                    if invdata_array.shape
+                    else row_array
                 )
-            del _row_array, _row_array_list
+            del row_array, row_array_list
         else:
            continue
 
     # Check column totals
-    for _col_tot in _col_totals:
+    for _col_tot in col_totals:
         assert_array_equal(
             _col_tot[2:],
             np.einsum(
-                "ij->j", _invdata_array[_invdata_array[:, 1] == _col_tot[1]][:, 2:]
+                "ij->j", invdata_array[invdata_array[:, 1] == _col_tot[1]][:, 2:]
             ),
         )
 
-    return _invdata_array[
-        np.argsort(np.einsum("ij,ij->i", [[100, 1]], _invdata_array[:, :2]))
+    return invdata_array[
+        np.argsort(np.einsum("ij,ij->i", [[100, 1]], invdata_array[:, :2]))
     ]
 
 
 def _process_table_blks_cnt_type(
     _table_blocks: Sequence[Sequence[str]], /
 ) -> ArrayBIGINT:
-    _cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
+    cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
 
-    _invdata_array: ArrayBIGINT = np.array(None)
-    _col_totals: ArrayBIGINT = np.zeros(3, np.int64)  # "enforced", "closed", "total"
+    invdata_array: ArrayBIGINT = np.array(None)
+    col_totals: ArrayBIGINT = np.zeros(3, np.uint64)  # "enforced", "closed", "total"
 
     for _tbl_blk in _table_blocks:
-        if _cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
-            _row_list_s = _blk_str.strip().replace(",", "").split("\n")
-            _row_list = np.array(
-                [CNT_FCOUNT_DICT[_row_list_s[0]], *_row_list_s[1:]], np.int64
+        if cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
+            row_list_s = _blk_str.strip().replace(",", "").split("\n")
+            row_list = np.array(
+                [CNT_FCOUNT_DICT[row_list_s[0]], *row_list_s[1:]], np.uint64
             )
-            del _row_list_s
-            if _row_list[3] != _row_list[1] + _row_list[2]:
+            del row_list_s
+            if row_list[3] != row_list[1] + row_list[2]:
                 raise ValueError(
                     "Total number of investigations does not equal #enforced plus #closed."
                 )
-            if _row_list[0] == TTL_KEY:
-                _col_totals = _row_list
+            if row_list[0] == TTL_KEY:
+                col_totals = row_list
             else:
-                _invdata_array = (
-                    np.vstack((_invdata_array, _row_list))
-                    if _invdata_array.shape
-                    else _row_list
+                invdata_array = (
+                    np.vstack((invdata_array, row_list))
+                    if invdata_array.shape
+                    else row_list
                 )
         else:
            continue
 
     if not np.array_equal(
-        np.array([int(f) for f in _col_totals[1:]], np.int64),
-        np.einsum("ij->j", _invdata_array[:, 1:]),
+        np.array(list(col_totals[1:]), np.uint64),
+        np.einsum("ij->j", invdata_array[:, 1:]),
    ):
         raise ValueError("Column totals don't compute.")
 
-    return _invdata_array[np.argsort(_invdata_array[:, 0])]
+    return invdata_array[np.argsort(invdata_array[:, 0])]
 
 
 def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
     if not _dl_path.is_dir():
         _dl_path.mkdir(parents=True)
 
-    _invdata_homepage_urls = (
+    invdata_homepage_urls = (
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
     )
-    _invdata_docnames = (
+    invdata_docnames = (
         "040831horizmergersdata96-03.pdf",
         "p035603horizmergerinvestigationdata1996-2005.pdf",
         "081201hsrmergerdata.pdf",
@@ -734,40 +727,40 @@ def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
     )
 
     if all(
-        _dl_path.joinpath(_invdata_docname).is_file()
-        for _invdata_docname in _invdata_docnames
+        _dl_path.joinpath(invdata_docname).is_file()
+        for invdata_docname in invdata_docnames
     ):
-        return _invdata_docnames
-
-    _invdata_docnames_dl: tuple[str, ...] = ()
-    _u3pm = urllib3.PoolManager()
-    _chunk_size = 1024 * 1024
-    for _invdata_homepage_url in _invdata_homepage_urls:
-        with _u3pm.request(
-            "GET", _invdata_homepage_url, preload_content=False
+        return invdata_docnames
+
+    invdata_docnames_dl: tuple[str, ...] = ()
+    u3pm = urllib3.PoolManager()
+    chunk_size_ = 1024 * 1024
+    for invdata_homepage_url in invdata_homepage_urls:
+        with u3pm.request(
+            "GET", invdata_homepage_url, preload_content=False
         ) as _u3handle:
-            _invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
-            _invdata_attrs = [
+            invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
+            invdata_attrs = [
                 (_g.get("title", ""), _g.get("href", ""))
-                for _g in _invdata_soup.find_all("a")
+                for _g in invdata_soup.find_all("a")
                 if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
             ]
-        for _invdata_attr in _invdata_attrs:
-            _invdata_docname, _invdata_link = _invdata_attr
-            _invdata_docnames_dl += (_invdata_docname,)
+        for invdata_attr in invdata_attrs:
+            invdata_docname, invdata_link = invdata_attr
+            invdata_docnames_dl += (invdata_docname,)
             with (
-                _u3pm.request(
-                    "GET", f"https://www.ftc.gov/{_invdata_link}", preload_content=False
+                u3pm.request(
+                    "GET", f"https://www.ftc.gov/{invdata_link}", preload_content=False
                 ) as _urlopen_handle,
-                _dl_path.joinpath(_invdata_docname).open("wb") as _invdata_fh,
+                _dl_path.joinpath(invdata_docname).open("wb") as invdata_fh,
             ):
                 while True:
-                    _data = _urlopen_handle.read(_chunk_size)
-                    if not _data:
+                    data = _urlopen_handle.read(chunk_size_)
+                    if not data:
                         break
-                    _invdata_fh.write(_data)
+                    invdata_fh.write(data)
 
-    return _invdata_docnames_dl
+    return invdata_docnames_dl
 
 
 if __name__ == "__main__":