PyPI - mergeron - Versions diffs - 2024.738972.0__py3-none-any.whl → 2024.739079.9__py3-none-any.whl - Mend

mergeron 2024.738972.0__py3-none-any.whl → 2024.739079.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mergeron might be problematic. Click here for more details.

Files changed (37) hide show

mergeron/__init__.py CHANGED Viewed

@@ -1,12 +1,19 @@
 from __future__ import annotations
 import enum
-from importlib.metadata import version
 from pathlib import Path
+from typing import Any
+import numpy as np
+import pendulum  # type: ignore
+from icecream import argumentToString, ic, install  # type: ignore
+from numpy.typing import NDArray
 _PKG_NAME: str = Path(__file__).parent.stem
-__version__ = version(_PKG_NAME)
+VERSION = "2024.739079.9"
+__version__ = VERSION
 DATA_DIR: Path = Path.home() / _PKG_NAME
 """
@@ -14,11 +21,26 @@ Defines a subdirectory named for this package in the user's home path.
 If the subdirectory doesn't exist, it is created on package invocation.
 """
 if not DATA_DIR.is_dir():
     DATA_DIR.mkdir(parents=False)
+np.set_printoptions(precision=18)
+def _timestamper() -> str:
+    return f"{pendulum.now().strftime("%F %T.%f")} |>  "
+@argumentToString.register(np.ndarray)  # type: ignore
+def _(_obj: NDArray[Any]) -> str:
+    return f"ndarray, shape={_obj.shape}, dtype={_obj.dtype}"
+ic.configureOutput(prefix=_timestamper, includeContext=True)
+install()
 @enum.unique
 class RECConstants(enum.StrEnum):
     """Recapture rate - derivation methods."""
@@ -38,8 +60,11 @@ class UPPAggrSelector(enum.StrEnum):
     AVG = "average"
     CPA = "cross-product-share weighted average"
     CPD = "cross-product-share weighted distance"
+    CPG = "cross-product-share weighted geometric mean"
     DIS = "symmetrically-weighted distance"
+    GMN = "geometric mean"
     MAX = "max"
     MIN = "min"
     OSA = "own-share weighted average"
     OSD = "own-share weighted distance"
+    OSG = "own-share weighted geometric mean"

mergeron/core/__init__.py CHANGED Viewed

@@ -1,68 +1,3 @@
-from __future__ import annotations
+from .. import VERSION  # noqa: TID252
-from importlib.metadata import version
-from attrs import Attribute, field, frozen, validators
-from .. import _PKG_NAME, RECConstants, UPPAggrSelector  # noqa: TID252
-__version__ = version(_PKG_NAME)
-def _delta_value_validator(
-    _instance: UPPBoundarySpec, _attribute: Attribute[float], _value: float, /
-) -> None:
-    if not 0 <= _value <= 1:
-        raise ValueError(
-            "Margin-adjusted benchmark share ratio must lie between 0 and 1."
-        )
-def _rec_spec_validator(
-    _instance: UPPBoundarySpec,
-    _attribute: Attribute[RECConstants],
-    _value: RECConstants,
-    /,
-) -> None:
-    if _value == RECConstants.OUTIN:
-        raise ValueError(
-            f"Invalid recapture specification, {_value!r}. "
-            "You may consider specifying RECConstants.INOUT here, and "
-            "assigning the recapture rate for the merging-firm with "
-            'the smaller market-share to the attribue, "rec" of '
-            "the UPPBoundarySpec object you pass."
-        )
-    if _value is None and _instance.agg_method != UPPAggrSelector.MAX:
-        raise ValueError(
-            f"Specified aggregation method, {_instance.agg_method} requires a recapture specification."
-        )
-@frozen
-class UPPBoundarySpec:
-    share_ratio: float = field(
-        kw_only=False,
-        default=0.075,
-        validator=(validators.instance_of(float), _delta_value_validator),
-    )
-    rec: float = field(
-        kw_only=False, default=0.80, validator=validators.instance_of(float)
-    )
-    agg_method: UPPAggrSelector = field(
-        kw_only=True,
-        default=UPPAggrSelector.MAX,
-        validator=validators.instance_of(UPPAggrSelector),
-    )
-    recapture_form: RECConstants | None = field(
-        kw_only=True,
-        default=RECConstants.INOUT,
-        validator=(
-            validators.optional(validators.instance_of(RECConstants)),  # type: ignore
-            _rec_spec_validator,
-        ),
-    )
-    precision: int = field(
-        kw_only=False, default=5, validator=validators.instance_of(int)
-    )
+__version__ = VERSION

mergeron/core/damodaran_margin_data.py CHANGED Viewed

@@ -7,7 +7,8 @@ Data are downloaded or reused from a local copy, on demand.
 For terms of use of Prof. Damodaran's data, please see:
 https://pages.stern.nyu.edu/~adamodar/New_Home_Page/datahistory.html
-Important caveats:
+NOTES
+-----
 Prof. Damodaran notes that the data construction may not be
 consistent from iteration to iteration. He also notes that,
@@ -32,29 +33,30 @@ price-cost margins fall in the interval :math:`[0, 1]`.
 """
+import shutil
 from collections.abc import Mapping
-from importlib.metadata import version
+from importlib import resources
 from pathlib import Path
 from types import MappingProxyType
 import msgpack  # type:ignore
 import numpy as np
-import requests
+import urllib3
 from numpy.random import PCG64DXSM, Generator, SeedSequence
 from numpy.typing import NDArray
-from requests_toolbelt.downloadutils import stream  # type: ignore
 from scipy import stats  # type: ignore
 from xlrd import open_workbook  # type: ignore
-from .. import _PKG_NAME, DATA_DIR  # noqa: TID252
-__version__ = version(_PKG_NAME)
+from .. import _PKG_NAME, DATA_DIR, VERSION  # noqa: TID252
+__version__ = VERSION
 MGNDATA_ARCHIVE_PATH = DATA_DIR / "damodaran_margin_data_dict.msgpack"
+u3pm = urllib3.PoolManager()
-def scrape_data_table(
+def mgn_data_getter(
     _table_name: str = "margin",
     *,
     data_archive_path: Path | None = None,
@@ -68,32 +70,46 @@ def scrape_data_table(
     _data_archive_path = data_archive_path or MGNDATA_ARCHIVE_PATH
     _mgn_urlstr = f"https://pages.stern.nyu.edu/~adamodar/pc/datasets/{_table_name}.xls"
-    _mgn_path = _data_archive_path.parent.joinpath(f"damodaran_{_table_name}_data.xls")
+    _mgn_path = _data_archive_path.parent / f"damodaran_{_table_name}_data.xls"
     if _data_archive_path.is_file() and not data_download_flag:
         return MappingProxyType(msgpack.unpackb(_data_archive_path.read_bytes()))
     elif _mgn_path.is_file():
         _mgn_path.unlink()
-        _data_archive_path.unlink()
-    _REQ_TIMEOUT = (9.05, 27)
-    # NYU will eventually updates its server certificate, to one signed with
-    #   "InCommon RSA Server CA 2.pem", the step below will be obsolete. In
-    #   the interim, it is necessary to provide the certificate chain to the
-    #   root CA, so that the obsolete CA certificate is validated.
-    _INCOMMON_2014_CERT_CHAIN_PATH = (
-        Path(__file__).parent / "InCommon RSA Server CA cert chain.pem"
-    )
-    try:
-        _urlopen_handle = requests.get(_mgn_urlstr, timeout=_REQ_TIMEOUT, stream=True)
-    except requests.exceptions.SSLError:
-        _urlopen_handle = requests.get(
-            _mgn_urlstr,
-            timeout=_REQ_TIMEOUT,
-            stream=True,
-            verify=str(_INCOMMON_2014_CERT_CHAIN_PATH),
-        )
+        if _data_archive_path.is_file():
+            _data_archive_path.unlink()
-    _mgn_filename = stream.stream_response_to_file(_urlopen_handle, path=_mgn_path)
+    try:
+        _chunk_size = 1024 * 1024
+        with (
+            u3pm.request("GET", _mgn_urlstr, preload_content=False) as _urlopen_handle,
+            _mgn_path.open("wb") as _mgn_file,
+        ):
+            while True:
+                _data = _urlopen_handle.read(_chunk_size)
+                if not _data:
+                    break
+                _mgn_file.write(_data)
+        print(f"Downloaded {_mgn_urlstr} to {_mgn_path}.")
+    except urllib3.exceptions.MaxRetryError as _err:
+        if isinstance(_err.__cause__, urllib3.exceptions.SSLError):
+            # Works fine with other sites secured with certificates
+            # from the Internet2 CA, such as,
+            # https://snap.stanford.edu/data/web-Stanford.txt.gz
+            print(
+                f"WARNING: Could not establish secure connection to, {_mgn_urlstr}."
+                "Using bundled copy."
+            )
+            if not _mgn_path.is_file():
+                with resources.as_file(
+                    resources.files(f"{_PKG_NAME}.data").joinpath(
+                        "damodaran_margin_data.xls"
+                    )
+                ) as _mgn_data_archive_path:
+                    shutil.copy2(_mgn_data_archive_path, _mgn_path)
+        else:
+            raise _err
     _xl_book = open_workbook(_mgn_path, ragged_rows=True, on_demand=True)
     _xl_sheet = _xl_book.sheet_by_name("Industry Averages")
@@ -123,7 +139,7 @@ def mgn_data_builder(
     _mgn_tbl_dict: Mapping[str, Mapping[str, float | int]] | None = None, /
 ) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
     if _mgn_tbl_dict is None:
-        _mgn_tbl_dict = scrape_data_table()
+        _mgn_tbl_dict = mgn_data_getter()
     _mgn_data_wts, _mgn_data_obs = (
         _f.flatten()
@@ -169,17 +185,19 @@ def mgn_data_builder(
     )
-def resample_mgn_data(
+def mgn_data_resampler(
     _sample_size: int | tuple[int, int] = (10**6, 2),
     /,
     *,
     seed_sequence: SeedSequence | None = None,
 ) -> NDArray[np.float64]:
     """
-    Generate the specified number of draws from the empirical distribution
-    for Prof. Damodaran's margin data using the estimated Gaussian KDE.
-    Margins for firms in finance, investment, insurance, reinsurance, and REITs
-    are excluded from the sample used to estimate the Gaussian KDE.
+    Generate draws from the empirical distribution bassed on Prof. Damodaran's margin data.
+    The empirical distribution is estimated using a Gaussian KDE; the bandwidth
+    selected using Silverman's rule is narrowed to reflect that the margin data
+    are multimodal. Margins for firms in finance, investment, insurance, reinsurance, and
+    REITs are excluded from the sample used to estimate the empirical distribution.
     Parameters
     ----------
@@ -198,28 +216,24 @@ def resample_mgn_data(
     _seed_sequence = seed_sequence or SeedSequence(pool_size=8)
-    _x, _w, _ = mgn_data_builder(scrape_data_table())
+    _x, _w, _ = mgn_data_builder(mgn_data_getter())
-    _mgn_kde = stats.gaussian_kde(_x, weights=_w)
+    _mgn_kde = stats.gaussian_kde(_x, weights=_w, bw_method="silverman")
+    _mgn_kde.set_bandwidth(bw_method=_mgn_kde.factor / 3.0)
-    def _generate_draws(
-        _mgn_kde: stats.gaussian_kde, _ssz: int, _seed_seq: SeedSequence
-    ) -> NDArray[np.float64]:
-        _seed = Generator(PCG64DXSM(_seed_sequence))
-        # We enlarge the sample, then truncate to
-        # the range between [0.0, 1.0)
-        ssz_up = int(_ssz / (_mgn_kde.integrate_box_1d(0.0, 1.0) ** 2))
-        sample_1 = _mgn_kde.resample(ssz_up, seed=_seed)[0]
+    if isinstance(_sample_size, int):
         return np.array(
-            sample_1[(sample_1 >= 0.0) & (sample_1 <= 1)][:_ssz], np.float64
+            _mgn_kde.resample(_sample_size, seed=Generator(PCG64DXSM(_seed_sequence)))[
+                0
+            ]
         )
-    if isinstance(_sample_size, int):
-        return _generate_draws(_mgn_kde, _sample_size, _seed_sequence)
-    else:
+    elif isinstance(_sample_size, tuple) and len(_sample_size) == 2:
         _ssz, _num_cols = _sample_size
         _ret_array = np.empty(_sample_size, np.float64)
         for _idx, _seed_seq in enumerate(_seed_sequence.spawn(_num_cols)):
-            _ret_array[:, _idx] = _generate_draws(_mgn_kde, _ssz, _seed_seq)
+            _ret_array[:, _idx] = _mgn_kde.resample(
+                _ssz, seed=Generator(PCG64DXSM(_seed_seq))
+            )[0]
         return _ret_array
+    else:
+        raise ValueError(f"Invalid sample size: {_sample_size!r}")

mergeron/core/excel_helper.py CHANGED Viewed

@@ -8,29 +8,21 @@ Includes a flexible system of defining cell formats.
 from __future__ import annotations
-import enum
 from collections.abc import Mapping, Sequence
-from importlib.metadata import version
-from types import MappingProxyType
-from typing import Any
+from typing import Any, ClassVar
 import numpy as np
 import numpy.typing as npt
 import xlsxwriter  # type: ignore
+from aenum import Enum, unique  # type: ignore
-from .. import _PKG_NAME  # noqa: TID252
+from .. import VERSION  # noqa: TID252
-__version__ = version(_PKG_NAME)
+__version__ = VERSION
-@enum.unique
-class CFmtParent(dict[str, Any], enum.ReprEnum):  # type: ignore
-    """Unique mappings defining xlsxwirter Workbook formats"""
-    ...
-class CFmt(CFmtParent):
+@unique
+class CFmt(Enum):  # type: ignore
     """
     Initialize cell formats for xlsxwriter.
@@ -42,31 +34,34 @@ class CFmt(CFmtParent):
     See, https://xlsxwriter.readthedocs.io/format.html
     """
-    XL_DEFAULT = MappingProxyType({"font_name": "Calibri", "font_size": 11})
-    XL_DEFAULT_2003 = MappingProxyType({"font_name": "Arial", "font_size": 10})
+    XL_DEFAULT: ClassVar = {"font_name": "Calibri", "font_size": 11}
+    XL_DEFAULT_2003: ClassVar = {"font_name": "Arial", "font_size": 10}
+    A_CTR: ClassVar = {"align": "center"}
+    A_CTR_ACROSS: ClassVar = {"align": "center_across"}
+    A_LEFT: ClassVar = {"align": "left"}
+    A_RIGHT: ClassVar = {"align": "right"}
-    A_CTR = MappingProxyType({"align": "center"})
-    A_CTR_ACROSS = MappingProxyType({"align": "center_across"})
-    A_LEFT = MappingProxyType({"align": "left"})
-    A_RIGHT = MappingProxyType({"align": "right"})
+    BOLD: ClassVar = {"bold": True}
+    ITALIC: ClassVar = {"italic": True}
+    ULINE: ClassVar = {"underline": True}
-    BOLD = MappingProxyType({"bold": True})
-    ITALIC = MappingProxyType({"italic": True})
-    ULINE = MappingProxyType({"underline": True})
+    TEXT_WRAP: ClassVar = {"text_wrap": True}
+    TEXT_ROTATE: ClassVar = {"rotation": 90}
+    IND_1: ClassVar = {"indent": 1}
-    TEXT_WRAP = MappingProxyType({"text_wrap": True})
-    IND_1 = MappingProxyType({"indent": 1})
+    DOLLAR_NUM: ClassVar = {"num_format": "[$$-409]#,##0.00"}
+    DT_NUM: ClassVar = {"num_format": "mm/dd/yyyy"}
+    QTY_NUM: ClassVar = {"num_format": "#,##0.0"}
+    PCT_NUM: ClassVar = {"num_format": "##0.000000%"}
+    AREA_NUM: ClassVar = {"num_format": "0.00000000"}
-    DOLLAR_NUM = MappingProxyType({"num_format": "[$$-409]#,##0.00"})
-    DT_NUM = MappingProxyType({"num_format": "mm/dd/yyyy"})
-    QTY_NUM = MappingProxyType({"num_format": "#,##0.0"})
-    PCT_NUM = MappingProxyType({"num_format": "##0.000000%"})
-    AREA_NUM = MappingProxyType({"num_format": "0.00000000"})
+    BAR_FILL: ClassVar = {"pattern": 1, "bg_color": "dfeadf"}
+    HDR_FILL: ClassVar = {"pattern": 1, "bg_color": "999999"}
-    BAR_FILL = MappingProxyType({"pattern": 1, "bg_color": "dfeadf"})
-    BOT_BORDER = MappingProxyType({"bottom": 1, "bottom_color": "000000"})
-    TOP_BORDER = MappingProxyType({"top": 1, "top_color": "000000"})
-    HDR_BORDER = TOP_BORDER | BOT_BORDER
+    BOT_BORDER: ClassVar = {"bottom": 1, "bottom_color": "000000"}
+    TOP_BORDER: ClassVar = {"top": 1, "top_color": "000000"}
+    HDR_BORDER: ClassVar = TOP_BORDER | BOT_BORDER
 def matrix_to_sheet(
@@ -208,7 +203,7 @@ def xl_fmt(
     _xl_book: xlsxwriter.Workbook, _cell_fmt: Sequence[CFmt] | CFmt | None, /
 ) -> xlsxwriter.format.Format:
     """
-    Return :code:`xlsxwriter` `Format` object given a CFmt enum, or tuple thereof.
+    Return :code:`xlsxwriter` `Format` object given a CFmt aenum, or tuple thereof.
     Parameters
     ----------
@@ -216,14 +211,14 @@ def xl_fmt(
         :code:`xlsxwriter.Workbook` object
     _cell_fmt
-        :code:`CFmt` enum object, or tuple thereof
+        :code:`CFmt` aenum object, or tuple thereof
     Returns
     -------
         :code:`xlsxwriter` `Format`  object
     """
-    _cell_fmt_dict: Mapping[str, Any] = MappingProxyType({})
+    _cell_fmt_dict: Mapping[str, Any] = {}
     if isinstance(_cell_fmt, tuple):
         ensure_cell_format_spec_tuple(_cell_fmt)
         for _cf in _cell_fmt:

mergeron/core/ftc_merger_investigations_data.py CHANGED Viewed

@@ -4,13 +4,13 @@ as necessary
 NOTES
 -----
-We drop reported row and column totals from source data for reducing stored data.
+Reported row and column totals from source data are not stored.
 """
 import shutil
 from collections.abc import Mapping, Sequence
-from importlib.metadata import version
+from importlib import resources
 from operator import itemgetter
 from pathlib import Path
 from types import MappingProxyType
@@ -22,12 +22,13 @@ import numpy as np
 import re2 as re  # type: ignore
 import requests
 from bs4 import BeautifulSoup
+from icecream import ic  # type: ignore
 from numpy.testing import assert_array_equal
 from numpy.typing import NDArray
-from .. import _PKG_NAME, DATA_DIR  # noqa: TID252
+from .. import _PKG_NAME, DATA_DIR, VERSION  # noqa: TID252
-__version__ = version(_PKG_NAME)
+__version__ = VERSION
 m.patch()
@@ -36,11 +37,16 @@ if not FTCDATA_DIR.is_dir():
     FTCDATA_DIR.mkdir(parents=True)
 INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.msgpack"
-if not INVDATA_ARCHIVE_PATH.is_file():
-    if (
-        _bundled_copy := Path(__file__).parent.joinpath(INVDATA_ARCHIVE_PATH.name)
-    ).is_file():
-        shutil.copyfile(_bundled_copy, INVDATA_ARCHIVE_PATH)
+if (
+    not INVDATA_ARCHIVE_PATH.is_file()
+    and (
+        _bundled_copy := resources.files(f"{_PKG_NAME}.data").joinpath(
+            INVDATA_ARCHIVE_PATH.name
+        )
+    ).is_file()
+):
+    with resources.as_file(_bundled_copy) as _bundled_copy_path:
+        shutil.copy2(_bundled_copy_path, INVDATA_ARCHIVE_PATH)
 TABLE_NO_RE = re.compile(r"Table \d+\.\d+")
 TABLE_TYPES = ("ByHHIandDelta", "ByFirmCount")
@@ -86,8 +92,8 @@ CNT_FCOUNT_DICT = {
 class INVTableData(NamedTuple):
-    ind_grp: str
-    evid_cond: str
+    industry_group: str
+    additional_evidence: str
     data_array: NDArray[np.int64]
@@ -181,7 +187,9 @@ def construct_data(
             _aggr_tables_list = [
                 _t
                 for _t in _invdata["1996-2003"][_table_type]
-                if re.sub(r"\W", "", _invdata["1996-2003"][_table_type][_t].ind_grp)
+                if re.sub(
+                    r"\W", "", _invdata["1996-2003"][_table_type][_t].industry_group
+                )
                 not in _industry_exclusion_list
             ]
@@ -254,8 +262,8 @@ def _construct_new_period_data(
         for _table_no in _invdata_cuml[_table_type]:
             _invdata_cuml_sub_table = _invdata_cuml[_table_type][_table_no]
             _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array = (
-                _invdata_cuml_sub_table.ind_grp,
-                _invdata_cuml_sub_table.evid_cond,
+                _invdata_cuml_sub_table.industry_group,
+                _invdata_cuml_sub_table.additional_evidence,
                 _invdata_cuml_sub_table.data_array,
             )
@@ -337,7 +345,7 @@ def _construct_new_period_data(
                 #   _invdata_array_bld_enfcls < 0, _invdata_array_bld_enfcls, 0
                 # )
                 # if np.einsum('ij->', invdata_array_bld_tbc):
-                #     print(
+                #     ic(
                 #       f"{_data_period}, {_table_no}, {_invdata_ind_group}:",
                 #       abs(np.einsum('ij->', invdata_array_bld_tbc))
                 #       )
@@ -395,22 +403,23 @@ def _parse_invdata() -> INVData:
         by range of HHI and ∆HHI.
     """
+    raise ValueError(
+        "This function is defined here as documentation.\n"
+        "NOTE: License for `pymupdf`, upon which this function depends,"
+        " may be incompatible with the MIT license,"
+        " under which this pacakge is distributed."
+        " Making this fumction operable requires the user to modify"
+        " the source code as well as to install an additional package"
+        " not distributed with this package or included in its dependencies."
+    )
     import fitz  # type: ignore
-    # user must install pymupdf to make this function operable
-    _invdata_docnames: Sequence[str] = (
-        "040831horizmergersdata96-03.pdf",
-        "p035603horizmergerinvestigationdata1996-2005.pdf",
-        "081201hsrmergerdata.pdf",
-        "130104horizontalmergerreport.pdf",
-    )
+    _invdata_docnames = _download_invdata(FTCDATA_DIR)
     _invdata: dict[str, dict[str, dict[str, INVTableData]]] = {}
     for _invdata_docname in _invdata_docnames:
         _invdata_pdf_path = FTCDATA_DIR.joinpath(_invdata_docname)
-        if not _invdata_pdf_path.is_file():
-            _download_invdata(FTCDATA_DIR)
         _invdata_fitz = fitz.open(_invdata_pdf_path)
         _invdata_meta = _invdata_fitz.metadata
@@ -542,7 +551,7 @@ def _parse_table_blocks(
         _invdata_evid_cond = "Unrestricted on additional evidence"
     else:
-        # print(_table_blocks)
+        # ic(_table_blocks)
         _invdata_evid_cond = (
             _table_blocks[1][-3].strip()
             if _table_ser == 9
@@ -561,8 +570,8 @@ def _parse_table_blocks(
     _table_array = process_table_func(_table_blocks)
     if not isinstance(_table_array, np.ndarray) or _table_array.dtype != np.int64:
-        print(_table_num)
-        print(_table_blocks)
+        ic(_table_num)
+        ic(_table_blocks)
         raise ValueError
     _table_data = INVTableData(_invdata_ind_group, _invdata_evid_cond, _table_array)
@@ -610,7 +619,7 @@ def _process_table_blks_conc_type(
                 _col_totals = _row_array
             else:
                 _invdata_array = (
-                    np.row_stack((_invdata_array, _row_array))
+                    np.vstack((_invdata_array, _row_array))
                     if _invdata_array.shape
                     else _row_array
                 )
@@ -657,7 +666,7 @@ def _process_table_blks_cnt_type(
                 _col_totals = _row_list
             else:
                 _invdata_array = (
-                    np.row_stack((_invdata_array, _row_list))
+                    np.vstack((_invdata_array, _row_list))
                     if _invdata_array.shape
                     else _row_list
                 )
@@ -673,27 +682,43 @@ def _process_table_blks_cnt_type(
     return _invdata_array[np.argsort(_invdata_array[:, 0])]
-def _download_invdata(_dl_path: Path) -> list[Any]:
+def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
+    if not _dl_path.is_dir():
+        _dl_path.mkdir(parents=True)
     _invdata_homepage_urls = (
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
     )
-    _invdata_docnames = []
+    _invdata_docnames = (
+        "040831horizmergersdata96-03.pdf",
+        "p035603horizmergerinvestigationdata1996-2005.pdf",
+        "081201hsrmergerdata.pdf",
+        "130104horizontalmergerreport.pdf",
+    )
+    if all(
+        _dl_path.joinpath(_invdata_docname).is_file()
+        for _invdata_docname in _invdata_docnames
+    ):
+        return _invdata_docnames
+    _invdata_docnames_dl: tuple[str, ...] = ()
     for _invdata_homepage_url in _invdata_homepage_urls:
         _invdata_soup = BeautifulSoup(
             requests.get(_invdata_homepage_url, verify=True, timeout=60).text,
             "html.parser",
         )
         _invdata_attrs = [
-            (_g.get("href", ""), _g.get("title", ""))
+            (_g.get("title", ""), _g.get("href", ""))
             for _g in _invdata_soup.find_all("a")
             if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
         ]
         for _invdata_attr in _invdata_attrs:
-            _invdata_link, _invdata_docname = _invdata_attr
-            _invdata_docnames += [_invdata_docname]
+            _invdata_docname, _invdata_link = _invdata_attr
+            _invdata_docnames_dl += (_invdata_docname,)
             with _dl_path.joinpath(_invdata_docname).open("wb") as _invdata_fh:
                 _invdata_fh.write(
                     requests.get(
@@ -701,4 +726,10 @@ def _download_invdata(_dl_path: Path) -> list[Any]:
                     ).content
                 )
-    return _invdata_docnames
+    return _invdata_docnames_dl
+if __name__ == "__main__":
+    print(
+        "This module defines functions for downloading and preparing FTC merger investigations data for further analysis."
+    )

mergeron 2024.738972.0__py3-none-any.whl → 2024.739079.9__py3-none-any.whl

Potentially problematic release.

mergeron 2024.738972.0__py3-none-any.whl → 2024.739079.9__py3-none-any.whl