mergeron-2025.739439.15-py3-none-any.whl → mergeron-2025.739439.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mergeron might be problematic.

mergeron/__init__.py CHANGED
@@ -15,7 +15,7 @@ from ruamel import yaml
 
 _PKG_NAME: str = Path(__file__).parent.name
 
-VERSION = "2025.739439.15"
+VERSION = "2025.739439.20"
 
 __version__ = VERSION
 
mergeron/core/__init__.py CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import shutil
 from collections.abc import Mapping
 from decimal import Decimal
 from types import MappingProxyType
@@ -20,6 +21,8 @@ from .. import ( # noqa: TID252
     yamelize_attrs,
     yaml_rt_mapper,
 )
+from .. import WORK_DIR as PKG_WORK_DIR  # noqa: TID252
+from .. import data as mdat  # noqa: TID252
 
 __version__ = VERSION
 
@@ -40,6 +43,64 @@ class GuidelinesBoundary:
     """Area under the boundary."""
 
 
+WORK_DIR = globals().get("WORK_DIR", PKG_WORK_DIR)
+"""Redefined, in case the user defines WORK_DIR between module imports."""
+
+FID_WORK_DIR = WORK_DIR / "FTCData"
+if not FID_WORK_DIR.is_dir():
+    FID_WORK_DIR.mkdir(parents=True)
+
+INVDATA_ARCHIVE_PATH = WORK_DIR / mdat.FTC_MERGER_INVESTIGATIONS_DATA.name
+if not INVDATA_ARCHIVE_PATH.is_file():
+    shutil.copy2(mdat.FTC_MERGER_INVESTIGATIONS_DATA, INVDATA_ARCHIVE_PATH)  # type: ignore
+
+TABLE_TYPES = ("ByHHIandDelta", "ByFirmCount")
+CONC_TABLE_ALL = "Table 3.1"
+CNT_TABLE_ALL = "Table 4.1"
+
+TTL_KEY = 86825
+CONC_HHI_DICT = {
+    "0 - 1,799": 0,
+    "1,800 - 1,999": 1800,
+    "2,000 - 2,399": 2000,
+    "2,400 - 2,999": 2400,
+    "3,000 - 3,999": 3000,
+    "4,000 - 4,999": 4000,
+    "5,000 - 6,999": 5000,
+    "7,000 - 10,000": 7000,
+    "TOTAL": TTL_KEY,
+}
+CONC_DELTA_DICT = {
+    "0 - 100": 0,
+    "100 - 200": 100,
+    "200 - 300": 200,
+    "300 - 500": 300,
+    "500 - 800": 500,
+    "800 - 1,200": 800,
+    "1,200 - 2,500": 1200,
+    "2,500 - 5,000": 2500,
+    "TOTAL": TTL_KEY,
+}
+CNT_FCOUNT_DICT = {
+    "2 to 1": 2,
+    "3 to 2": 3,
+    "4 to 3": 4,
+    "5 to 4": 5,
+    "6 to 5": 6,
+    "7 to 6": 7,
+    "8 to 7": 8,
+    "9 to 8": 9,
+    "10 to 9": 10,
+    "10 +": 11,
+    "TOTAL": TTL_KEY,
+}
+
+
+def invert_map(_dict: Mapping[Any, Any]) -> Mapping[Any, Any]:
+    """Invert mapping, mapping values to keys of the original mapping."""
+    return {_v: _k for _k, _v in _dict.items()}
+
+
 @frozen
 class INVTableData:
     """Represents individual table of FTC merger investigations data."""
mergeron/core/_process_ftc_merger_investigations_data.py ADDED
@@ -0,0 +1,379 @@
+"""Download and parse FTC Merger Investigations Data.
+
+This module is provided as documentation only. The package
+:code:`pymupdf` is a requirement of this module but is
+distributed under a license that may be incompatible with
+the MIT license under which this package is distributed.
+
+"""
+
+import re
+from collections.abc import Sequence
+from operator import itemgetter
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+# import pymupdf  # type: ignore
+import urllib3
+from bs4 import BeautifulSoup
+from numpy.testing import assert_array_equal
+
+from .. import ArrayBIGINT  # noqa: TID252
+from . import (
+    CNT_FCOUNT_DICT,
+    CONC_DELTA_DICT,
+    CONC_HHI_DICT,
+    CONC_TABLE_ALL,
+    FID_WORK_DIR,
+    TABLE_TYPES,
+    TTL_KEY,
+    INVData,
+    INVData_in,
+    INVTableData,
+    _mappingproxy_from_mapping,
+)
+
+TABLE_NO_RE = re.compile(r"Table \d+\.\d+")
+
+
+def _parse_invdata() -> INVData:
+    """Parse FTC merger investigations data reports to structured data.
+
+    Returns
+    -------
+    Immutable dictionary of merger investigations data, keyed to
+    reporting period, and including all tables organized by
+    Firm Count (number of remaining competitors) and
+    by range of HHI and ∆HHI.
+    """
+    raise ValueError(
+        "This function is defined here as documentation.\n"
+        "NOTE: License for `pymupdf`, upon which this function depends,"
+        " may be incompatible with the MIT license,"
+        " under which this package is distributed."
+        " Making this function operable requires the user to modify"
+        " the source code as well as to install an additional package"
+        " not distributed with this package or identified as a requirement."
+    )
+
+    invdata_docnames = _download_invdata(FID_WORK_DIR)
+
+    invdata: INVData_in = {}
+
+    for invdata_docname in invdata_docnames:
+        invdata_pdf_path = FID_WORK_DIR.joinpath(invdata_docname)
+
+        invdata_doc = pymupdf.open(invdata_pdf_path)  # type: ignore # noqa: F821
+        invdata_meta = invdata_doc.metadata
+        if invdata_meta["title"] == " ":
+            invdata_meta["title"] = ", ".join((
+                "Horizontal Merger Investigation Data",
+                "Fiscal Years",
+                "1996-2005",
+            ))
+
+        data_period = "".join(  # line-break here for readability
+            re.findall(r"(\d{4}) *(-) *(\d{4})", invdata_meta["title"])[0]
+        )
+
+        # Initialize containers for parsed data
+        invdata[data_period] = {k: {} for k in TABLE_TYPES}
+
+        for pdf_pg in invdata_doc.pages():
+            doc_pg_blocks = pdf_pg.get_text("blocks", sort=False)
+            # Across all published reports of FTC investigations data,
+            # sorting lines (PDF page blocks) by the lower coordinates
+            # and then the left coordinates is most effective for
+            # ordering table rows in top-to-bottom order; this doesn't
+            # work for the 1996-2005 data, however, so we resort later
+            doc_pg_blocks = sorted([
+                (f"{_f[3]:03.0f}{_f[0]:03.0f}{_f[1]:03.0f}{_f[2]:03.0f}", *_f)
+                for _f in doc_pg_blocks
+                if _f[-1] == 0
+            ])
+
+            data_blocks: list[tuple[str]] = [("",)]
+            # Page layouts are not the same in all reports
+            pg_hdr_strings = (
+                "FEDERAL TRADE COMMISSION",
+                "HORIZONTAL MERGER INVESTIGATION DATA: FISCAL YEARS 1996 - 2011",
+            )
+            if len(doc_pg_blocks) > 4:
+                tnum = None
+                for _pg_blk in doc_pg_blocks:
+                    if tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
+                        data_blocks = [
+                            b_
+                            for b_ in doc_pg_blocks
+                            if not b_[-3].startswith(pg_hdr_strings)
+                            and (
+                                b_[-3].strip()
+                                not in {"Significant Competitors", "Post Merger HHI"}
+                            )
+                            and not re.fullmatch(r"\d+", b_[-3].strip())
+                        ]
+                        break
+                if not tnum:
+                    continue
+                del tnum
+            else:
+                continue
+
+            _parse_page_blocks(invdata, data_period, data_blocks)
+
+        invdata_doc.close()
+
+    return _mappingproxy_from_mapping(invdata)
+
+
+def _parse_page_blocks(
+    _invdata: INVData_in, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
+) -> None:
+    if _data_period != "1996-2011":
+        _parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
+    else:
+        test_list = [
+            (g, f[-3].strip())
+            for g, f in enumerate(_doc_pg_blocks)
+            if TABLE_NO_RE.fullmatch(f[-3].strip())
+        ]
+        # In the 1996-2011 report, there are 2 tables per page
+        if len(test_list) == 1:
+            table_a_blocks = _doc_pg_blocks
+            table_b_blocks: Sequence[Sequence[Any]] = []
+        else:
+            table_a_blocks, table_b_blocks = (
+                _doc_pg_blocks[test_list[0][0] : test_list[1][0]],
+                _doc_pg_blocks[test_list[1][0] :],
+            )
+
+        for table_i_blocks in table_a_blocks, table_b_blocks:
+            if not table_i_blocks:
+                continue
+            _parse_table_blocks(_invdata, _data_period, table_i_blocks)
+
+
+def _parse_table_blocks(
+    _invdata: INVData_in, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
+) -> None:
+    invdata_evid_cond = "Unrestricted on additional evidence"
+    table_num, table_ser, table_type = _identify_table_type(
+        _table_blocks[0][-3].strip()
+    )
+
+    if _data_period == "1996-2011":
+        invdata_ind_group = (
+            _table_blocks[1][-3].split("\n")[1]
+            if table_num == "Table 4.8"
+            else _table_blocks[2][-3].split("\n", maxsplit=1)[0]
+        )
+
+        if table_ser > 4:
+            invdata_evid_cond = (
+                _table_blocks[2][-3].split("\n")[1]
+                if table_ser in {9, 10}
+                else _table_blocks[3][-3].strip()
+            )
+
+    elif _data_period == "1996-2005":
+        _table_blocks = sorted(_table_blocks, key=itemgetter(6))
+
+        invdata_ind_group = _table_blocks[3][-3].strip()
+        if table_ser > 4:
+            invdata_evid_cond = _table_blocks[5][-3].strip()
+
+    elif table_ser % 2 == 0:
+        invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
+        if (evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
+            invdata_evid_cond = "Unrestricted on additional evidence"
+        else:
+            invdata_evid_cond = evid_cond_teststr
+
+    elif _table_blocks[3][-3].startswith("FTC Horizontal Merger Investigations"):
+        invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
+        invdata_evid_cond = "Unrestricted on additional evidence"
+
+    else:
+        # print(_table_blocks)
+        invdata_evid_cond = (
+            _table_blocks[1][-3].strip()
+            if table_ser == 9
+            else _table_blocks[3][-3].strip()
+        )
+        invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
+
+    if invdata_ind_group == "Pharmaceutical Markets":
+        invdata_ind_group = "Pharmaceuticals Markets"
+
+    process_table_func = (
+        _process_table_blks_conc_type
+        if table_type == TABLE_TYPES[0]
+        else _process_table_blks_cnt_type
+    )
+
+    table_array = process_table_func(_table_blocks)
+    if not isinstance(table_array, np.ndarray) or table_array.dtype != int:
+        print(table_num)
+        print(_table_blocks)
+        raise ValueError
+
+    table_data = INVTableData(invdata_ind_group, invdata_evid_cond, table_array)
+    _invdata[_data_period][table_type] |= {table_num: table_data}
+
+
+def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str]:
+    tnum = _tnstr.split(" ")[1]
+    tsub = int(tnum.split(".")[0])
+    return _tnstr, tsub, TABLE_TYPES[(tsub + 1) % 2]
+
+
+def _process_table_blks_conc_type(
+    _table_blocks: Sequence[Sequence[str]], /
+) -> ArrayBIGINT:
+    conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
+
+    col_titles = tuple(CONC_DELTA_DICT.values())
+    col_totals: ArrayBIGINT = np.zeros(len(col_titles), int)
+    invdata_array: ArrayBIGINT = np.array(None)
+
+    for tbl_blk in _table_blocks:
+        if conc_row_pat.match(_blk_str := tbl_blk[-3]):
+            row_list: list[str] = _blk_str.strip().split("\n")
+            row_title: str = row_list.pop(0)
+            row_key: int = (
+                7000 if row_title.startswith("7,000") else CONC_HHI_DICT[row_title]
+            )
+            row_total = np.array(row_list.pop().replace(",", "").split("/"), int)
+            data_row_list: list[list[int]] = []
+            while row_list:
+                enfd_val, clsd_val = row_list.pop(0).split("/")
+                data_row_list += [
+                    [
+                        row_key,
+                        col_titles[len(data_row_list)],
+                        int(enfd_val),
+                        int(clsd_val),
+                        int(enfd_val) + int(clsd_val),
+                    ]
+                ]
+            data_row_array = np.array(data_row_list, int)
+            del data_row_list
+            # Check row totals
+            assert_array_equal(row_total, np.einsum("ij->j", data_row_array[:, 2:4]))
+
+            if row_key == TTL_KEY:
+                col_totals = data_row_array
+            else:
+                invdata_array = (
+                    np.vstack((invdata_array, data_row_array))
+                    if invdata_array.shape
+                    else data_row_array
+                )
+            del data_row_array
+        else:
+            continue
+
+    # Check column totals
+    for _col_tot in col_totals:
+        assert_array_equal(
+            _col_tot[2:],
+            np.einsum(
+                "ij->j", invdata_array[invdata_array[:, 1] == _col_tot[1]][:, 2:]
+            ),
+        )
+
+    return invdata_array[
+        np.argsort(np.einsum("ij,ij->i", [[100, 1]], invdata_array[:, :2]))
+    ]
+
+
+def _process_table_blks_cnt_type(
+    _table_blocks: Sequence[Sequence[str]], /
+) -> ArrayBIGINT:
+    cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
+
+    invdata_array: ArrayBIGINT = np.array(None)
+    col_totals: ArrayBIGINT = np.zeros(3, int)  # "enforced", "closed", "total"
+
+    for _tbl_blk in _table_blocks:
+        if cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
+            row_list_s = _blk_str.strip().replace(",", "").split("\n")
+            row_list = np.array([CNT_FCOUNT_DICT[row_list_s[0]], *row_list_s[1:]], int)
+            del row_list_s
+            if row_list[3] != row_list[1] + row_list[2]:
+                raise ValueError(
+                    "Total number of investigations does not equal #enforced plus #closed."
+                )
+            if row_list[0] == TTL_KEY:
+                col_totals = row_list
+            else:
+                invdata_array = (
+                    np.vstack((invdata_array, row_list))
+                    if invdata_array.shape
+                    else row_list
+                )
+        else:
+            continue
+
+    if not np.array_equal(
+        np.array(list(col_totals[1:]), int), np.einsum("ij->j", invdata_array[:, 1:])
+    ):
+        raise ValueError("Column totals don't compute.")
+
+    return invdata_array[np.argsort(invdata_array[:, 0])]
+
+
+def _download_invdata(_dl_path: Path = FID_WORK_DIR) -> tuple[str, ...]:
+    if not _dl_path.is_dir():
+        _dl_path.mkdir(parents=True)
+
+    invdata_homepage_urls = (
+        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
+        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
+        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
+        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
+    )
+    invdata_docnames = (
+        "040831horizmergersdata96-03.pdf",
+        "p035603horizmergerinvestigationdata1996-2005.pdf",
+        "081201hsrmergerdata.pdf",
+        "130104horizontalmergerreport.pdf",
+    )
+
+    if all(
+        _dl_path.joinpath(invdata_docname).is_file()
+        for invdata_docname in invdata_docnames
+    ):
+        return invdata_docnames
+
+    invdata_docnames_dl: tuple[str, ...] = ()
+    u3pm = urllib3.PoolManager()
+    chunk_size_ = 1024 * 1024
+    for invdata_homepage_url in invdata_homepage_urls:
+        with u3pm.request(
+            "GET", invdata_homepage_url, preload_content=False
+        ) as _u3handle:
+            invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
+            invdata_attrs = [
+                (_g.get("title", ""), _g.get("href", ""))
+                for _g in invdata_soup.find_all("a")
+                if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
+            ]
+        for invdata_attr in invdata_attrs:
+            invdata_docname, invdata_link = invdata_attr
+            invdata_docnames_dl += (invdata_docname,)
+            with (
+                u3pm.request(
+                    "GET", f"https://www.ftc.gov/{invdata_link}", preload_content=False
+                ) as _urlopen_handle,
+                _dl_path.joinpath(invdata_docname).open("wb") as invdata_fh,
+            ):
+                while True:
+                    data = _urlopen_handle.read(chunk_size_)
+                    if not data:
+                        break
+                    invdata_fh.write(data)
+
+    return invdata_docnames_dl
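
The module ships inert (note the early raise in _parse_invdata), but the table-numbering rule it encodes deserves a worked example: _identify_table_type classifies a table by the parity of its series number, so odd series (3, 9, ...) are HHI/ΔHHI tables and even series (4, 10, ...) are firm-count tables. The values below follow directly from the function body:

    _identify_table_type("Table 3.1")  # -> ("Table 3.1", 3, "ByHHIandDelta")
    _identify_table_type("Table 4.7")  # -> ("Table 4.7", 4, "ByFirmCount")
    # TABLE_TYPES[(series + 1) % 2]: odd series -> index 0, even series -> index 1
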
mergeron/core/ftc_merger_investigations_data.py CHANGED
@@ -1,5 +1,8 @@
-"""
-Methods to parse FTC Merger Investigations Data, downloading source documents as needed.
+"""Methods to load and augment FTC Merger Investigations Data.
+
+Details on downloading and processing the data are specified in
+the "private" module, :code:`_process_ftc_merger_investigations_data`.
+
 
 Notes
 -----
@@ -10,92 +13,31 @@ Reported row and column totals from source data are not stored.
 from __future__ import annotations
 
 import re
-import shutil
-from collections.abc import Mapping, Sequence
-from operator import itemgetter
+from collections.abc import Sequence
 from pathlib import Path
 from types import MappingProxyType
-from typing import Any
 from zipfile import ZIP_DEFLATED, ZipFile
 
 import numpy as np
-import urllib3
-from bs4 import BeautifulSoup
-from numpy.testing import assert_array_equal
 
-from .. import EMPTY_ARRAYINT, VERSION, ArrayBIGINT, this_yaml  # noqa: TID252
-from .. import WORK_DIR as PKG_WORK_DIR  # noqa: TID252
-from .. import data as mdat  # noqa: TID252
+from .. import EMPTY_ARRAYINT, VERSION, this_yaml  # noqa: TID252
 from . import (
+    CNT_TABLE_ALL,
+    CONC_TABLE_ALL,
+    INVDATA_ARCHIVE_PATH,
+    TABLE_TYPES,
     INVData,
     INVData_in,
     INVTableData,
     _dict_from_mapping,
     _mappingproxy_from_mapping,
 )
+from ._process_ftc_merger_investigations_data import _parse_invdata
 
 __version__ = VERSION
 
 # cspell: "includeRegExpList": ["strings", "comments", /( {3}['"]{3}).*?\\1/g]
 
-WORK_DIR = globals().get("WORK_DIR", PKG_WORK_DIR)
-"""Redefined, in case the user defines WORK_DIR between module imports."""
-
-FID_WORK_DIR = WORK_DIR / "FTCData"
-if not FID_WORK_DIR.is_dir():
-    FID_WORK_DIR.mkdir(parents=True)
-
-INVDATA_ARCHIVE_PATH = WORK_DIR / mdat.FTC_MERGER_INVESTIGATIONS_DATA.name
-if not INVDATA_ARCHIVE_PATH.is_file():
-    shutil.copy2(mdat.FTC_MERGER_INVESTIGATIONS_DATA, INVDATA_ARCHIVE_PATH)  # type: ignore
-
-TABLE_NO_RE = re.compile(r"Table \d+\.\d+")
-TABLE_TYPES = ("ByHHIandDelta", "ByFirmCount")
-CONC_TABLE_ALL = "Table 3.1"
-CNT_TABLE_ALL = "Table 4.1"
-
-TTL_KEY = 86825
-CONC_HHI_DICT = {
-    "0 - 1,799": 0,
-    "1,800 - 1,999": 1800,
-    "2,000 - 2,399": 2000,
-    "2,400 - 2,999": 2400,
-    "3,000 - 3,999": 3000,
-    "4,000 - 4,999": 4000,
-    "5,000 - 6,999": 5000,
-    "7,000 - 10,000": 7000,
-    "TOTAL": TTL_KEY,
-}
-CONC_DELTA_DICT = {
-    "0 - 100": 0,
-    "100 - 200": 100,
-    "200 - 300": 200,
-    "300 - 500": 300,
-    "500 - 800": 500,
-    "800 - 1,200": 800,
-    "1,200 - 2,500": 1200,
-    "2,500 - 5,000": 2500,
-    "TOTAL": TTL_KEY,
-}
-CNT_FCOUNT_DICT = {
-    "2 to 1": 2,
-    "3 to 2": 3,
-    "4 to 3": 4,
-    "5 to 4": 5,
-    "6 to 5": 6,
-    "7 to 6": 7,
-    "8 to 7": 8,
-    "9 to 8": 9,
-    "10 to 9": 10,
-    "10 +": 11,
-    "TOTAL": TTL_KEY,
-}
-
-
-def invert_map(_dict: Mapping[Any, Any]) -> Mapping[Any, Any]:
-    """Invert mapping, mapping values to keys of the original mapping."""
-    return {_v: _k for _k, _v in _dict.items()}
-
 
 def construct_data(
     _archive_path: Path = INVDATA_ARCHIVE_PATH,
@@ -268,7 +210,7 @@ def _construct_new_period_data(
 
     invdata_cuml = _invdata[cuml_period]
 
-    base_period = "1996-{}".format(int(_data_period.split("-")[0]) - 1)
+    base_period = "1996-{}".format(int(_data_period.split("-", maxsplit=1)[0]) - 1)
     invdata_base = _invdata[base_period]
 
     if tuple(invdata_cuml.keys()) != TABLE_TYPES:
@@ -326,7 +268,9 @@ def _construct_new_period_data(
                 table_no
             ].data_array[:, -3:-1]
         ]
-        if pd_start == 1996 and pd_end < int(_data_period.split("-")[0]):
+        if pd_start == 1996 and pd_end < int(
+            _data_period.split("-", maxsplit=1)[0]
+        ):
             invdata_base_array_stack += [
                 _invdata[data_period_detail][table_type][
                     table_no
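
The maxsplit=1 form in the two hunks above is behavior-preserving for the "YYYY-YYYY" period labels used here; a quick check of the arithmetic with a hypothetical detail period:

    _data_period = "2008-2011"  # hypothetical new detail period
    start_year = int(_data_period.split("-", maxsplit=1)[0])  # 2008
    base_period = "1996-{}".format(start_year - 1)            # "1996-2007"
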
@@ -414,350 +358,6 @@ def invdata_build_aggregate_table(
     )
 
 
-def _parse_invdata() -> INVData:
-    """Parse FTC merger investigations data reports to structured data.
-
-    Returns
-    -------
-    Immutable dictionary of merger investigations data, keyed to
-    reporting period, and including all tables organized by
-    Firm Count (number of remaining competitors) and
-    by range of HHI and ∆HHI.
-
-    """
-    raise ValueError(
-        "This function is defined here as documentation.\n"
-        "NOTE: License for `pymupdf`, upon which this function depends,"
-        " may be incompatible with the MIT license,"
-        " under which this package is distributed."
-        " Making this function operable requires the user to modify"
-        " the source code as well as to install an additional package"
-        " not distributed with this package or identified as a requirement."
-    )
-    import pymupdf  # type: ignore
-
-    invdata_docnames = _download_invdata(FID_WORK_DIR)
-
-    invdata: INVData_in = {}
-
-    for invdata_docname in invdata_docnames:
-        invdata_pdf_path = FID_WORK_DIR.joinpath(invdata_docname)
-
-        invdata_doc = pymupdf.open(invdata_pdf_path)
-        invdata_meta = invdata_doc.metadata
-        if invdata_meta["title"] == " ":
-            invdata_meta["title"] = ", ".join((
-                "Horizontal Merger Investigation Data",
-                "Fiscal Years",
-                "1996-2005",
-            ))
-
-        data_period = "".join(  # line-break here for readability
-            re.findall(r"(\d{4}) *(-) *(\d{4})", invdata_meta["title"])[0]
-        )
-
-        # Initialize containers for parsed data
-        invdata[data_period] = {k: {} for k in TABLE_TYPES}
-
-        for pdf_pg in invdata_doc.pages():
-            doc_pg_blocks = pdf_pg.get_text("blocks", sort=False)
-            # Across all published reports of FTC investigations data,
-            # sorting lines (PDF page blocks) by the lower coordinates
-            # and then the left coordinates is most effective for
-            # ordering table rows in top-to-bottom order; this doesn't
-            # work for the 1996-2005 data, however, so we resort later
-            doc_pg_blocks = sorted([
-                (f"{_f[3]:03.0f}{_f[0]:03.0f}{_f[1]:03.0f}{_f[2]:03.0f}", *_f)
-                for _f in doc_pg_blocks
-                if _f[-1] == 0
-            ])
-
-            data_blocks: list[tuple[str]] = [("",)]
-            # Page layouts are not the same in all reports
-            pg_hdr_strings = (
-                "FEDERAL TRADE COMMISSION",
-                "HORIZONTAL MERGER INVESTIGATION DATA: FISCAL YEARS 1996 - 2011",
-            )
-            if len(doc_pg_blocks) > 4:
-                tnum = None
-                for _pg_blk in doc_pg_blocks:
-                    if tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
-                        data_blocks = [
-                            b_
-                            for b_ in doc_pg_blocks
-                            if not b_[-3].startswith(pg_hdr_strings)
-                            and (
-                                b_[-3].strip()
-                                not in {"Significant Competitors", "Post Merger HHI"}
-                            )
-                            and not re.fullmatch(r"\d+", b_[-3].strip())
-                        ]
-                        break
-                if not tnum:
-                    continue
-                del tnum
-            else:
-                continue
-
-            _parse_page_blocks(invdata, data_period, data_blocks)
-
-        invdata_doc.close()
-
-    return _mappingproxy_from_mapping(invdata)
-
-
-def _parse_page_blocks(
-    _invdata: INVData_in, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
-) -> None:
-    if _data_period != "1996-2011":
-        _parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
-    else:
-        test_list = [
-            (g, f[-3].strip())
-            for g, f in enumerate(_doc_pg_blocks)
-            if TABLE_NO_RE.fullmatch(f[-3].strip())
-        ]
-        # In the 1996-2011 report, there are 2 tables per page
-        if len(test_list) == 1:
-            table_a_blocks = _doc_pg_blocks
-            table_b_blocks: Sequence[Sequence[Any]] = []
-        else:
-            table_a_blocks, table_b_blocks = (
-                _doc_pg_blocks[test_list[0][0] : test_list[1][0]],
-                _doc_pg_blocks[test_list[1][0] :],
-            )
-
-        for table_i_blocks in table_a_blocks, table_b_blocks:
-            if not table_i_blocks:
-                continue
-            _parse_table_blocks(_invdata, _data_period, table_i_blocks)
-
-
-def _parse_table_blocks(
-    _invdata: INVData_in, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
-) -> None:
-    invdata_evid_cond = "Unrestricted on additional evidence"
-    table_num, table_ser, table_type = _identify_table_type(
-        _table_blocks[0][-3].strip()
-    )
-
-    if _data_period == "1996-2011":
-        invdata_ind_group = (
-            _table_blocks[1][-3].split("\n")[1]
-            if table_num == "Table 4.8"
-            else _table_blocks[2][-3].split("\n")[0]
-        )
-
-        if table_ser > 4:
-            invdata_evid_cond = (
-                _table_blocks[2][-3].split("\n")[1]
-                if table_ser in {9, 10}
-                else _table_blocks[3][-3].strip()
-            )
-
-    elif _data_period == "1996-2005":
-        _table_blocks = sorted(_table_blocks, key=itemgetter(6))
-
-        invdata_ind_group = _table_blocks[3][-3].strip()
-        if table_ser > 4:
-            invdata_evid_cond = _table_blocks[5][-3].strip()
-
-    elif table_ser % 2 == 0:
-        invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
-        if (evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
-            invdata_evid_cond = "Unrestricted on additional evidence"
-        else:
-            invdata_evid_cond = evid_cond_teststr
-
-    elif _table_blocks[3][-3].startswith("FTC Horizontal Merger Investigations"):
-        invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
-        invdata_evid_cond = "Unrestricted on additional evidence"
-
-    else:
-        # print(_table_blocks)
-        invdata_evid_cond = (
-            _table_blocks[1][-3].strip()
-            if table_ser == 9
-            else _table_blocks[3][-3].strip()
-        )
-        invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
-
-    if invdata_ind_group == "Pharmaceutical Markets":
-        invdata_ind_group = "Pharmaceuticals Markets"
-
-    process_table_func = (
-        _process_table_blks_conc_type
-        if table_type == TABLE_TYPES[0]
-        else _process_table_blks_cnt_type
-    )
-
-    table_array = process_table_func(_table_blocks)
-    if not isinstance(table_array, np.ndarray) or table_array.dtype != int:
-        print(table_num)
-        print(_table_blocks)
-        raise ValueError
-
-    table_data = INVTableData(invdata_ind_group, invdata_evid_cond, table_array)
-    _invdata[_data_period][table_type] |= {table_num: table_data}
-
-
-def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str]:
-    tnum = _tnstr.split(" ")[1]
-    tsub = int(tnum.split(".")[0])
-    return _tnstr, tsub, TABLE_TYPES[(tsub + 1) % 2]
-
-
-def _process_table_blks_conc_type(
-    _table_blocks: Sequence[Sequence[str]], /
-) -> ArrayBIGINT:
-    conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
-
-    col_titles = tuple(CONC_DELTA_DICT.values())
-    col_totals: ArrayBIGINT = np.zeros(len(col_titles), int)
-    invdata_array: ArrayBIGINT = np.array(None)
-
-    for tbl_blk in _table_blocks:
-        if conc_row_pat.match(_blk_str := tbl_blk[-3]):
-            row_list: list[str] = _blk_str.strip().split("\n")
-            row_title: str = row_list.pop(0)
-            row_key: int = (
-                7000 if row_title.startswith("7,000") else CONC_HHI_DICT[row_title]
-            )
-            row_total = np.array(row_list.pop().replace(",", "").split("/"), int)
-            data_row_list: list[list[int]] = []
-            while row_list:
-                enfd_val, clsd_val = row_list.pop(0).split("/")
-                data_row_list += [
-                    [
-                        row_key,
-                        col_titles[len(data_row_list)],
-                        int(enfd_val),
-                        int(clsd_val),
-                        int(enfd_val) + int(clsd_val),
-                    ]
-                ]
-            data_row_array = np.array(data_row_list, int)
-            del data_row_list
-            # Check row totals
-            assert_array_equal(row_total, np.einsum("ij->j", data_row_array[:, 2:4]))
-
-            if row_key == TTL_KEY:
-                col_totals = data_row_array
-            else:
-                invdata_array = (
-                    np.vstack((invdata_array, data_row_array))
-                    if invdata_array.shape
-                    else data_row_array
-                )
-            del data_row_array
-        else:
-            continue
-
-    # Check column totals
-    for _col_tot in col_totals:
-        assert_array_equal(
-            _col_tot[2:],  # type: ignore
-            np.einsum(
-                "ij->j",
-                invdata_array[invdata_array[:, 1] == _col_tot[1]][:, 2:],  # type: ignore
-            ),
-        )
-
-    return invdata_array[
-        np.argsort(np.einsum("ij,ij->i", [[100, 1]], invdata_array[:, :2]))
-    ]
-
-
-def _process_table_blks_cnt_type(
-    _table_blocks: Sequence[Sequence[str]], /
-) -> ArrayBIGINT:
-    cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
-
-    invdata_array: ArrayBIGINT = np.array(None)
-    col_totals: ArrayBIGINT = np.zeros(3, int)  # "enforced", "closed", "total"
-
-    for _tbl_blk in _table_blocks:
-        if cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
-            row_list_s = _blk_str.strip().replace(",", "").split("\n")
-            row_list = np.array([CNT_FCOUNT_DICT[row_list_s[0]], *row_list_s[1:]], int)
-            del row_list_s
-            if row_list[3] != row_list[1] + row_list[2]:
-                raise ValueError(
-                    "Total number of investigations does not equal #enforced plus #closed."
-                )
-            if row_list[0] == TTL_KEY:
-                col_totals = row_list
-            else:
-                invdata_array = (
-                    np.vstack((invdata_array, row_list))
-                    if invdata_array.shape
-                    else row_list
-                )
-        else:
-            continue
-
-    if not np.array_equal(
-        np.array(list(col_totals[1:]), int), np.einsum("ij->j", invdata_array[:, 1:])
-    ):
-        raise ValueError("Column totals don't compute.")
-
-    return invdata_array[np.argsort(invdata_array[:, 0])]
-
-
-def _download_invdata(_dl_path: Path = FID_WORK_DIR) -> tuple[str, ...]:
-    if not _dl_path.is_dir():
-        _dl_path.mkdir(parents=True)
-
-    invdata_homepage_urls = (
-        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
-        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
-        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
-        "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
-    )
-    invdata_docnames = (
-        "040831horizmergersdata96-03.pdf",
-        "p035603horizmergerinvestigationdata1996-2005.pdf",
-        "081201hsrmergerdata.pdf",
-        "130104horizontalmergerreport.pdf",
-    )
-
-    if all(
-        _dl_path.joinpath(invdata_docname).is_file()
-        for invdata_docname in invdata_docnames
-    ):
-        return invdata_docnames
-
-    invdata_docnames_dl: tuple[str, ...] = ()
-    u3pm = urllib3.PoolManager()
-    chunk_size_ = 1024 * 1024
-    for invdata_homepage_url in invdata_homepage_urls:
-        with u3pm.request(
-            "GET", invdata_homepage_url, preload_content=False
-        ) as _u3handle:
-            invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
-            invdata_attrs = [
-                (_g.get("title", ""), _g.get("href", ""))
-                for _g in invdata_soup.find_all("a")
-                if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
-            ]
-        for invdata_attr in invdata_attrs:
-            invdata_docname, invdata_link = invdata_attr
-            invdata_docnames_dl += (invdata_docname,)
-            with (
-                u3pm.request(
-                    "GET", f"https://www.ftc.gov/{invdata_link}", preload_content=False
-                ) as _urlopen_handle,
-                _dl_path.joinpath(invdata_docname).open("wb") as invdata_fh,
-            ):
-                while True:
-                    data = _urlopen_handle.read(chunk_size_)
-                    if not data:
-                        break
-                    invdata_fh.write(data)
-
-    return invdata_docnames_dl
-
-
 if __name__ == "__main__":
     print(
         "This module defines functions for downloading and preparing FTC merger investigations data for further analysis."
mergeron/core/guidelines_boundary_functions.py CHANGED
@@ -728,7 +728,10 @@ def lerp[LerpT: (float, MPFloat, ArrayDouble)](
         case _:
             if not 0 <= _r <= 1:
                 raise ValueError("Specified interpolation weight must lie in [0, 1].")
-            return fma(_x2, _r, fma(_x1, -_r, _x1))
+            if isinstance(_x1, np.ndarray) or isinstance(_x2, np.ndarray):
+                return (1 - _r) * _x1 + _r * _x2
+            else:
+                return fma(_x2, _r, fma(_x1, -_r, _x1))
 
 
 def round_cust(
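
The new branch routes ndarray inputs around fma, which operates on scalars; both branches evaluate the same affine combination (1 - r)*x1 + r*x2. A quick scalar identity check (using math.fma, available since Python 3.13; the module's own fma import may differ):

    import math

    x1, x2, r = 3.0, 7.0, 0.25
    lerp_fma = math.fma(x2, r, math.fma(x1, -r, x1))  # fma(x1, -r, x1) == (1 - r)*x1
    assert lerp_fma == (1 - r) * x1 + r * x2 == 4.0
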
mergeron/gen/enforcement_stats.py CHANGED
@@ -7,8 +7,7 @@ import numpy as np
 from scipy.interpolate import make_interp_spline  # type: ignore
 
 from .. import VERSION, ArrayBIGINT, Enameled, this_yaml  # noqa: TID252
-from ..core import INVData, INVTableData  # noqa: TID252
-from ..core import ftc_merger_investigations_data as fid  # noqa: TID252
+from ..core import TABLE_TYPES, TTL_KEY, INVData, INVTableData  # noqa: TID252
 from . import INVResolution
 
 __version__ = VERSION
@@ -117,7 +116,7 @@ ZONE_STRINGS = {
     0: R"Green Zone (Safeharbor)",
     1: R"Yellow Zone",
     2: R"Red Zone (SLC Presumption)",
-    fid.TTL_KEY: "TOTAL",
+    TTL_KEY: "TOTAL",
 }
 ZONE_DETAIL_STRINGS_HHI = {
     0: Rf"HHI < {HHI_POST_ZONE_KNOTS[1]} pts.",
@@ -226,7 +225,7 @@ def enf_cnts_obs_byfirmcount(
             f"Must be one of, {tuple(_data_array_dict.keys())!r}."
         )
 
-    data_array_dict_sub = _data_array_dict[_data_period][fid.TABLE_TYPES[1]]
+    data_array_dict_sub = _data_array_dict[_data_period][TABLE_TYPES[1]]
 
     table_no_ = table_no_lku(data_array_dict_sub, _table_ind_group, _table_evid_cond)
 
@@ -280,7 +279,7 @@ def enf_cnts_obs_byhhianddelta(
             f"Must be one of, {tuple(_data_array_dict.keys())!r}."
         )
 
-    data_array_dict_sub = _data_array_dict[_data_period][fid.TABLE_TYPES[0]]
+    data_array_dict_sub = _data_array_dict[_data_period][TABLE_TYPES[0]]
 
     table_no_ = table_no_lku(data_array_dict_sub, _table_ind_group, _table_evid_cond)
 
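
The changes in this file are mechanical: the fid module alias is dropped, and TABLE_TYPES and TTL_KEY come straight from mergeron.core, where the constants now live (see the core/__init__.py hunks above). The index convention the two counting functions rely on:

    from mergeron.core import TABLE_TYPES, TTL_KEY

    assert TABLE_TYPES == ("ByHHIandDelta", "ByFirmCount")
    # enf_cnts_obs_byhhianddelta reads TABLE_TYPES[0]; enf_cnts_obs_byfirmcount, TABLE_TYPES[1]
    assert TTL_KEY == 86825  # the "TOTAL" key, also used in ZONE_STRINGS
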
mergeron-2025.739439.15.dist-info/METADATA → mergeron-2025.739439.20.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: mergeron
-Version: 2025.739439.15
+Version: 2025.739439.20
 Summary: Python for analyzing merger enforcement policy
 License: MIT
 Keywords: merger enforcement policy,merger guidelines,merger screening,enforcement presumptions,concentration standards,diversion ratio,upward pricing pressure,GUPPI
@@ -11,7 +11,6 @@ Classifier: Development Status :: 4 - Beta
 Classifier: Environment :: Console
 Classifier: Intended Audience :: End Users/Desktop
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: Implementation :: CPython
mergeron-2025.739439.15.dist-info/RECORD → mergeron-2025.739439.20.dist-info/RECORD CHANGED
@@ -1,9 +1,10 @@
-mergeron/__init__.py,sha256=Na9jn_e8vhC7BVwW695TUbcS-3yZzSR66eVVgmrGRnI,6734
-mergeron/core/__init__.py,sha256=zJcc50CAwm0oNKwk4p-P7oRwVfB_Fp4u0Do5dUeUXfI,3027
+mergeron/__init__.py,sha256=3TwIpToRlmjqQ5xXRaxLmUOoxpe32Pu4oOYEwJpCShE,6734
+mergeron/core/__init__.py,sha256=3QrvMtE0OPfdvhOHo_B2AopWMZHNGpCAjdx8wuYu1Wk,4584
+mergeron/core/_process_ftc_merger_investigations_data.py,sha256=Ros_Ew9mqBmptvhf3jonS2Et8_OoFhN3ODtO1Q2Xv-c,13657
 mergeron/core/empirical_margin_distribution.py,sha256=61U-KLB563BPWM5zWyWp82c4PhcsAG-IKI0WWYGjBKg,11740
-mergeron/core/ftc_merger_investigations_data.py,sha256=oM4cs2PnyeSwyV1LOE_EYCUEzCKPm7lnCGxLIc6JQY8,28820
+mergeron/core/ftc_merger_investigations_data.py,sha256=JjnsSlBBQX_-aiQuVEOInuMpsETbx6aXDUSTikCfvns,14552
 mergeron/core/guidelines_boundaries.py,sha256=Z8rZvhHrxXBgrLGFpb6yldc8h3lN9rGtGj4yu-fyVBA,15450
-mergeron/core/guidelines_boundary_functions.py,sha256=fdWbqOb3Khz0OkbSTO7amE1q_ao3puZY5tEzj0p4h1o,30695
+mergeron/core/guidelines_boundary_functions.py,sha256=zgKHOWZcPuI6hbTkHb7O5YxYW8rCfZJATHM_gmdVhjw,30841
 mergeron/core/pseudorandom_numbers.py,sha256=CFp-8eu0q2g-81LA0k2oCFltmp6Er7EkrAkoG19G7Os,10138
 mergeron/data/__init__.py,sha256=SAFkR23RBM0zwGam2TeWmw08oHAKmU2YF-Nygj73ies,1845
 mergeron/data/damodaran_margin_data_serialized.zip,sha256=Wc1v9buSrYTWWAravG8W9nPbgsU07zMtSAR2RvMQU5s,623482
@@ -11,11 +12,11 @@ mergeron/data/ftc_merger_investigations_data.zip,sha256=tiB2TLFyS9LMSFIv8DBA_oEE
 mergeron/gen/__init__.py,sha256=6xUhaG4kWj2Qx8hLjgjupFWcJ0ZzAKDY9TN7mAFrANI,23880
 mergeron/gen/data_generation.py,sha256=cZW3Dc6bNiBUPXjTDHZDwTc6x1sxXq2STCzfsvk6_tw,17638
 mergeron/gen/data_generation_functions.py,sha256=SWzZ3I7ulkGBcL2F5CCKw2IvCm_wEplvqBasnSjSyU0,26129
-mergeron/gen/enforcement_stats.py,sha256=V3ZeVv-iLFUuKPeM503cMKiVVaYeGVrA_6lInAdXA5w,14387
+mergeron/gen/enforcement_stats.py,sha256=2MNEMxBgeIennS8hsiN-33aVEA_qGHy5hIh9FSxl0no,14324
 mergeron/gen/upp_tests.py,sha256=v-tnhQ85j8zL-TTE52GC61GEZSRFfdCkjaitVQIz0FI,6464
 mergeron/perks/__init__.py,sha256=gGRIuRc7I6OuWLzwSiSZSIE0PEoxAy2DRFWg0VVLlbE,484
 mergeron/perks/guidelines_boundary_functions_extra.py,sha256=q-Cqk9t5oj4yiAsmZJcsfrH434oGvza4YVspFYpdV0g,22113
 mergeron/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
-mergeron-2025.739439.15.dist-info/METADATA,sha256=Lv6ufuDIli7g_dr5cuqEK3sFy-ggTqr1R61_XMm-_QQ,4167
-mergeron-2025.739439.15.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-mergeron-2025.739439.15.dist-info/RECORD,,
+mergeron-2025.739439.20.dist-info/METADATA,sha256=BVG7gVVtnU50p-M9R_TFcwc6QSc2hirvoBJMuKihA1w,4116
+mergeron-2025.739439.20.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+mergeron-2025.739439.20.dist-info/RECORD,,