PyPI - mergeron - Versions diffs - 2025.739290.3__py3-none-any.whl → 2025.739290.5__py3-none-any.whl - Mend

mergeron 2025.739290.3__py3-none-any.whl → 2025.739290.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mergeron might be problematic. Click here for more details.

Files changed (21) hide show

mergeron/__init__.py +103 -48
mergeron/core/__init__.py +105 -4
mergeron/core/empirical_margin_distribution.py +100 -78
mergeron/core/ftc_merger_investigations_data.py +309 -316
mergeron/core/guidelines_boundaries.py +67 -138
mergeron/core/guidelines_boundary_functions.py +202 -379
mergeron/core/guidelines_boundary_functions_extra.py +264 -106
mergeron/core/pseudorandom_numbers.py +73 -64
mergeron/data/damodaran_margin_data_serialized.zip +0 -0
mergeron/data/ftc_invdata.zip +0 -0
mergeron/demo/visualize_empirical_margin_distribution.py +9 -7
mergeron/gen/__init__.py +138 -161
mergeron/gen/data_generation.py +181 -149
mergeron/gen/data_generation_functions.py +220 -237
mergeron/gen/enforcement_stats.py +78 -109
mergeron/gen/upp_tests.py +119 -194
{mergeron-2025.739290.3.dist-info → mergeron-2025.739290.5.dist-info}/METADATA +2 -3
mergeron-2025.739290.5.dist-info/RECORD +24 -0
{mergeron-2025.739290.3.dist-info → mergeron-2025.739290.5.dist-info}/WHEEL +1 -1
mergeron/data/damodaran_margin_data_dict.msgpack +0 -0
mergeron-2025.739290.3.dist-info/RECORD +0 -23

mergeron/gen/data_generation.py CHANGED Viewed

@@ -5,6 +5,8 @@ Methods to generate data for analyzing merger enforcement policy.
 from __future__ import annotations
+import zipfile
+from itertools import starmap
 from typing import TypedDict
 import numpy as np
@@ -13,7 +15,14 @@ from joblib import Parallel, cpu_count, delayed  # type: ignore
 from numpy.random import SeedSequence
 from ruamel import yaml
-from .. import NTHREADS, VERSION, RECForm, this_yaml  # noqa: TID252  # noqa
+from .. import (  # noqa: TID252  # noqa
+    _PKG_NAME,
+    NTHREADS,
+    VERSION,
+    RECForm,
+    this_yaml,
+    yaml_rt_mapper,
+)
 from ..core import guidelines_boundaries as gbl  # noqa: TID252
 from ..core.guidelines_boundaries import HMGThresholds  # noqa: TID252
 from . import (
@@ -34,10 +43,12 @@ from .data_generation_functions import (
     gen_margin_price_data,
     gen_share_data,
 )
-from .upp_tests import compute_upp_test_counts
+from .upp_tests import compute_upp_test_counts  # type: ignore # has pytypes marker ...
 __version__ = VERSION
+H5_CHUNK_SIZE = 10**6
 class SamplingFunctionKWArgs(TypedDict, total=False):
     "Keyword arguments of sampling methods defined below"
@@ -56,22 +67,15 @@ def _seed_data_conv(_v: SeedSequenceData | None, _i: MarketSample) -> SeedSequen
     if isinstance(_v, SeedSequenceData):
         return _v
-    _mktshr_dist_type = _i.share_spec.dist_type
-    _price_spec = _i.price_spec
-    _seed_count = 2 if _mktshr_dist_type == SHRDistribution.UNI else 3
-    _seed_count += 1 if _price_spec == PriceSpec.ZERO else 0
-    _sseq_list = tuple(SeedSequence(pool_size=8) for _ in range(_seed_count))
-    _mktshr_rng_seed_seq, _pcm_rng_seed_seq = _sseq_list[:2]
-    _fcount_rng_seed_seq = (
-        None if _mktshr_dist_type == SHRDistribution.UNI else _sseq_list[2]
-    )
-    _pr_rng_seed_seq = _sseq_list[-1] if _price_spec == PriceSpec.ZERO else None
+    _sseq = tuple(SeedSequence(pool_size=8) for _ in range(4))
+    _sdtt = _i.share_spec.dist_type == SHRDistribution.UNI
+    _pst = _i.price_spec == PriceSpec.RNG
     return SeedSequenceData(
-        _mktshr_rng_seed_seq, _pcm_rng_seed_seq, _fcount_rng_seed_seq, _pr_rng_seed_seq
+        share=_sseq[0],
+        pcm=_sseq[1],
+        fcounts=(None if _sdtt else _sseq[2]),
+        price=(None if not _pst else (_sseq[2] if _sdtt else _sseq[3])),
     )
@@ -141,7 +145,7 @@ class MarketSample:
                 "Set seed_data.fcounts to None and retry."
             )
-        if _i.price_spec != PriceSpec.ZERO and _v.price is not None:
+        if _i.price_spec != PriceSpec.RNG and _v.price is not None:
             raise ValueError(
                 "Attribute, seed_data.price is ignored as irrelevant unless "
                 "prices are asymmetric and uncorrelated and price-cost margins "
@@ -151,12 +155,12 @@ class MarketSample:
     nthreads: int = field(default=NTHREADS, validator=validators.instance_of(int))
     """number of parallel threads to use"""
-    data: MarketSampleData | None = field(default=None)
+    dataset: MarketSampleData | None = field(default=None, init=False)
-    enf_counts: UPPTestsCounts | None = field(default=None)
+    enf_counts: UPPTestsCounts | None = field(default=None, init=False)
     def _gen_market_sample(
-        self, /, *, sample_size: int, seed_data: SeedSequenceData | None, nthreads: int
+        self, /, *, sample_size: int, seed_data: SeedSequenceData, nthreads: int
     ) -> MarketSampleData:
         """
         Generate share, diversion ratio, price, and margin data for MarketSpec.
@@ -170,108 +174,81 @@ class MarketSample:
         """
-        _recapture_form = self.share_spec.recapture_form
-        _recapture_ratio = self.share_spec.recapture_ratio
-        _dist_type_mktshr = self.share_spec.dist_type
-        _dist_firm2_pcm = self.pcm_spec.firm2_pcm_constraint
-        _hsr_filing_test_type = self.hsr_filing_test_type
-        _seed_data = seed_data or self.seed_data
-        (
-            _mktshr_rng_seed_seq,
-            _pcm_rng_seed_seq,
-            _fcount_rng_seed_seq,
-            _pr_rng_seed_seq,
-        ) = (getattr(_seed_data, _a) for _a in _seed_data.__dataclass_fields__)
-        _shr_sample_size = 1.0 * (sample_size or self.sample_size)
         # Scale up sample size to offset discards based on specified criteria
-        _shr_sample_size *= _hsr_filing_test_type
-        if _dist_firm2_pcm == FM2Constraint.MNL:
-            _shr_sample_size *= SSZConstant.MNL_DEP
-        _shr_sample_size = int(_shr_sample_size)
+        shr_sample_size = sample_size * self.hsr_filing_test_type
+        shr_sample_size *= (
+            SSZConstant.MNL_DEP
+            if self.pcm_spec.firm2_pcm_constraint == FM2Constraint.MNL
+            else 1
+        )
+        shr_sample_size = int(shr_sample_size)
         # Generate share data
-        _mktshr_data = gen_share_data(
-            _shr_sample_size,
+        mktshr_data = gen_share_data(
+            shr_sample_size,
             self.share_spec,
-            _fcount_rng_seed_seq,
-            _mktshr_rng_seed_seq,
-            nthreads or self.nthreads,
-        )
-        _mktshr_array, _fcounts, _aggregate_purchase_prob, _nth_firm_share = (
-            getattr(_mktshr_data, _f)
-            for _f in (
-                "mktshr_array",
-                "fcounts",
-                "aggregate_purchase_prob",
-                "nth_firm_share",
-            )
+            seed_data.fcounts,
+            seed_data.share,
+            nthreads,
         )
+        mktshr_array_ = mktshr_data.mktshr_array
+        fcounts_ = mktshr_data.fcounts
+        aggregate_purchase_prob_ = mktshr_data.aggregate_purchase_prob
+        nth_firm_share_ = mktshr_data.nth_firm_share
+        del mktshr_data
         # Generate merging-firm price and PCM data
-        _margin_data, _price_data = gen_margin_price_data(
-            _mktshr_array[:, :2],
-            _nth_firm_share,
-            _aggregate_purchase_prob,
+        margin_data, price_data = gen_margin_price_data(
+            mktshr_array_[:, :2],
+            nth_firm_share_,
+            aggregate_purchase_prob_,
             self.pcm_spec,
             self.price_spec,
             self.hsr_filing_test_type,
-            _pcm_rng_seed_seq,
-            _pr_rng_seed_seq,
+            seed_data.pcm,
+            seed_data.price,
             nthreads,
         )
-        _price_array, _hsr_filing_test = (
-            getattr(_price_data, _f) for _f in ("price_array", "hsr_filing_test")
-        )
-        _pcm_array, _mnl_test_rows = (
-            getattr(_margin_data, _f) for _f in ("pcm_array", "mnl_test_array")
-        )
-        _mnl_test_rows = _mnl_test_rows * _hsr_filing_test
-        _s_size = sample_size  # originally-specified sample size
-        if _dist_firm2_pcm == FM2Constraint.MNL:
-            _mktshr_array = _mktshr_array[_mnl_test_rows][:_s_size]
-            _pcm_array = _pcm_array[_mnl_test_rows][:_s_size]
-            _price_array = _price_array[_mnl_test_rows][:_s_size]
-            _fcounts = _fcounts[_mnl_test_rows][:_s_size]
-            _aggregate_purchase_prob = _aggregate_purchase_prob[_mnl_test_rows][
-                :_s_size
+        pcm_array_ = margin_data.pcm_array
+        price_array_ = price_data.price_array
+        if shr_sample_size > sample_size:
+            mnl_test_rows = margin_data.mnl_test_array * price_data.hsr_filing_test
+            mktshr_array_ = mktshr_array_[mnl_test_rows][:sample_size]
+            pcm_array_ = margin_data.pcm_array[mnl_test_rows][:sample_size]
+            price_array_ = price_data.price_array[mnl_test_rows][:sample_size]
+            fcounts_ = fcounts_[mnl_test_rows][:sample_size]
+            aggregate_purchase_prob_ = aggregate_purchase_prob_[mnl_test_rows][
+                :sample_size
             ]
-            _nth_firm_share = _nth_firm_share[_mnl_test_rows][:_s_size]
-        # Calculate diversion ratios
-        _divr_array = gen_divr_array(
-            _recapture_form,
-            _recapture_ratio,
-            _mktshr_array[:, :2],
-            _aggregate_purchase_prob,
-        )
-        del _mnl_test_rows, _s_size
+            nth_firm_share_ = nth_firm_share_[mnl_test_rows][:sample_size]
-        _frmshr_array = _mktshr_array[:, :2]
-        _hhi_delta = np.einsum("ij,ij->i", _frmshr_array, _frmshr_array[:, ::-1])[
-            :, None
-        ]
+            del mnl_test_rows
-        _hhi_post = (
-            _hhi_delta + np.einsum("ij,ij->i", _mktshr_array, _mktshr_array)[:, None]
+        # Calculate diversion ratios
+        divr_array = gen_divr_array(
+            self.share_spec.recapture_form,
+            self.share_spec.recapture_ratio,
+            mktshr_array_[:, :2],
+            aggregate_purchase_prob_,
         )
         return MarketSampleData(
-            _frmshr_array,
-            _pcm_array,
-            _price_array,
-            _fcounts,
-            _aggregate_purchase_prob,
-            _nth_firm_share,
-            _divr_array,
-            _hhi_post,
-            _hhi_delta,
+            mktshr_array_[:, :2],
+            pcm_array_,
+            price_array_,
+            divr_array,
+            np.einsum("ij,ij->i", mktshr_array_[:, :2], mktshr_array_[:, [1, 0]])[
+                :, None
+            ],
+            aggregate_purchase_prob_,
+            fcounts_,
+            nth_firm_share_,
+            (
+                np.einsum("ij,ij->i", mktshr_array_[:, :2], mktshr_array_[:, [1, 0]])
+                + np.einsum("ij,ij->i", mktshr_array_, mktshr_array_)
+            )[:, None],
         )
     def generate_sample(self, /) -> None:
@@ -283,7 +260,7 @@ class MarketSample:
         """
-        self.data = self._gen_market_sample(
+        self.dataset = self._gen_market_sample(
             seed_data=self.seed_data,
             sample_size=self.sample_size,
             nthreads=self.nthreads,
@@ -328,21 +305,15 @@ class MarketSample:
         """
-        _market_data_sample = self._gen_market_sample(
+        market_data_sample = self._gen_market_sample(
             sample_size=sample_size, seed_data=seed_data, nthreads=nthreads
         )
-        _invalid_array_names = (
-            ("fcounts", "choice_prob_outgd", "nth_firm_share", "hhi_post")
-            if self.share_spec.dist_type == "Uniform"
-            else ()
-        )
-        _upp_test_arrays = compute_upp_test_counts(
-            _market_data_sample, _upp_test_parms, _sim_test_regime
+        upp_test_arrays: UPPTestsCounts = compute_upp_test_counts(
+            market_data_sample, _upp_test_parms, _sim_test_regime
         )
-        return _upp_test_arrays
+        return upp_test_arrays
     def __sim_enf_cnts_ll(
         self, _enf_parm_vec: gbl.HMGThresholds, _sim_test_regime: UPPTestRegime, /
@@ -372,12 +343,10 @@ class MarketSample:
             ΔHHI and concentration zone
         """
-        _sample_sz = self.sample_size
-        _subsample_sz = 10**6
-        _iter_count = (
-            int(_sample_sz / _subsample_sz) if _subsample_sz < _sample_sz else 1
-        )
-        _thread_count = self.nthreads or cpu_count()
+        sample_sz = self.sample_size
+        subsample_sz = H5_CHUNK_SIZE
+        iter_count = (sample_sz / subsample_sz).__ceil__()  # noqa: PLC2801
+        thread_count = self.nthreads or cpu_count()
         if (
             self.share_spec.recapture_form != RECForm.OUTIN
@@ -391,51 +360,49 @@ class MarketSample:
                 )
             )
-        _rng_seed_data = [
-            SeedSequenceData(*_z)
-            for _z in (
+        rng_seed_data = list(
+            starmap(
+                SeedSequenceData,
                 zip(
                     *[
-                        _s.spawn(_iter_count) if _s else [None] * _iter_count
+                        _s.spawn(iter_count) if _s else [None] * iter_count
                         for _s in (
-                            getattr(self.seed_data, _a)
-                            for _a in self.seed_data.__dataclass_fields__
+                            getattr(self.seed_data, _a.name)
+                            for _a in self.seed_data.__attrs_attrs__
                         )
                     ],
                     strict=True,
-                )
+                ),
             )
-        ]
+        )
-        __sim_enf_cnts_kwargs: SamplingFunctionKWArgs = SamplingFunctionKWArgs({
-            "sample_size": _subsample_sz,
+        sim_enf_cnts_kwargs: SamplingFunctionKWArgs = SamplingFunctionKWArgs({
+            "sample_size": subsample_sz,
             "nthreads": self.nthreads,
         })
-        _res_list = Parallel(n_jobs=_thread_count, prefer="threads")(
+        res_list = Parallel(n_jobs=thread_count, prefer="threads")(
             delayed(self.__sim_enf_cnts)(
                 _enf_parm_vec,
                 _sim_test_regime,
-                **__sim_enf_cnts_kwargs,
+                **sim_enf_cnts_kwargs,
                 seed_data=_rng_seed_data_ch,
             )
-            for _iter_id, _rng_seed_data_ch in enumerate(_rng_seed_data)
+            for _iter_id, _rng_seed_data_ch in enumerate(rng_seed_data)
         )
-        _res_list_stacks = UPPTestsCounts(*[
-            np.stack([getattr(_j, _k) for _j in _res_list])
+        res_list_stacks = UPPTestsCounts(*[
+            np.stack([getattr(_j, _k) for _j in res_list])
             for _k in ("by_firm_count", "by_delta", "by_conczone")
         ])
         upp_test_results = UPPTestsCounts(*[
             np.column_stack((
-                (_gv := getattr(_res_list_stacks, _g))[0, :, :_h],
+                (_gv := getattr(res_list_stacks, _g.name))[0, :, :_h],
                 np.einsum("ijk->jk", _gv[:, :, _h:], dtype=np.int64),
             ))
-            for _g, _h in zip(
-                _res_list_stacks.__dataclass_fields__, [1, 1, 3], strict=True
-            )
+            for _g, _h in zip(res_list_stacks.__attrs_attrs__, [1, 1, 3], strict=True)
         ])
-        del _res_list, _res_list_stacks
+        del res_list, res_list_stacks
         return upp_test_results
@@ -462,29 +429,94 @@ class MarketSample:
         """
-        if self.data is None:
+        if self.dataset is None:
             self.enf_counts = self.__sim_enf_cnts_ll(_enf_parm_vec, _upp_test_regime)
         else:
             self.enf_counts = compute_upp_test_counts(
-                self.data, _enf_parm_vec, _upp_test_regime
+                self.dataset, _enf_parm_vec, _upp_test_regime
             )
+    def to_archive(
+        self, zip_: zipfile.ZipFile, _subdir: str = "", /, *, save_dataset: bool = False
+    ) -> None:
+        zpath = zipfile.Path(zip_, at=_subdir)
+        name_root = f"{_PKG_NAME}_market_sample"
+        with (zpath / f"{name_root}.yaml").open("w") as _yfh:
+            this_yaml.dump(self, _yfh)
+        if save_dataset:
+            if all((_ndt := self.dataset is None, _net := self.enf_counts is None)):
+                raise ValueError(
+                    "No dataset and/or enforcement counts available for saving. "
+                    "Generate some data or set save_dataset to False to poceed."
+                )
+            if not _ndt:
+                # byte_stream = io.BytesIO()
+                # with h5py.File(byte_stream, "w") as h5f:
+                #     for _a in self.dataset.__attrs_attrs__:
+                #         if all((
+                #             (_arr := getattr(self.dataset, _a.name)).any(),
+                #             not np.isnan(_arr).all(),
+                #         )):
+                #             h5f.create_dataset(_a.name, data=_arr, fletcher32=True)
+                with (zpath / f"{name_root}_dataset.h5").open("wb") as _hfh:
+                    _hfh.write(self.dataset.to_h5bin())
+            if not _net:
+                with (zpath / f"{name_root}_enf_counts.yaml").open("w") as _yfh:
+                    this_yaml.dump(self.enf_counts, _yfh)
+    def from_archive(
+        zip_: zipfile.ZipFile, _subdir: str = "", /, *, restore_dataset: bool = False
+    ) -> MarketSample:
+        zpath = zipfile.Path(zip_, at=_subdir)
+        name_root = f"{_PKG_NAME}_market_sample"
+        market_sample_ = this_yaml.load((zpath / f"{name_root}.yaml").read_text())
+        if restore_dataset:
+            if not any((
+                (_dt := (_dp := zpath / f"{name_root}_dataset.h5").is_file()),
+                (_et := (_ep := zpath / f"{name_root}_enf_counts.yaml").is_file()),
+            )):
+                raise ValueError(
+                    "Archive has no sample data to restore. "
+                    "Delete second argument, or set it False, and rerun."
+                )
+            if _dt:
+                with _dp.open("rb") as _hfh:
+                    object.__setattr__(  # noqa: PLC2801
+                        market_sample_,
+                        "dataset",
+                        # MarketSampleData(**{_a: h5f[_a][:] for _a in h5f}),
+                        MarketSampleData.from_h5f(_hfh),
+                    )
+            if _et:
+                object.__setattr__(  # noqa: PLC2801
+                    market_sample_, "enf_counts", this_yaml.load(_ep.read_text())
+                )
+        return market_sample_
     @classmethod
     def to_yaml(
-        cls, _r: yaml.representer.SafeRepresenter, _d: MarketSample
+        cls, _r: yaml.representer.RoundTripRepresenter, _d: MarketSample
     ) -> yaml.MappingNode:
-        _ret: yaml.MappingNode = _r.represent_mapping(
+        retval: yaml.MappingNode = _r.represent_mapping(
             f"!{cls.__name__}",
             {
                 _a.name: getattr(_d, _a.name)
                 for _a in _d.__attrs_attrs__
-                if _a.name not in ("data", "enf_counts")
+                if _a.name not in {"dataset", "enf_counts"}
             },
         )
-        return _ret
+        return retval
     @classmethod
     def from_yaml(
-        cls, _c: yaml.constructor.SafeConstructor, _n: yaml.MappingNode
+        cls, _c: yaml.constructor.RoundTripConstructor, _n: yaml.MappingNode
     ) -> MarketSample:
-        return cls(**_c.construct_mapping(_n))
+        return cls(**yaml_rt_mapper(_c, _n))

mergeron 2025.739290.3__py3-none-any.whl → 2025.739290.5__py3-none-any.whl

Potentially problematic release.

mergeron 2025.739290.3__py3-none-any.whl → 2025.739290.5__py3-none-any.whl