mergeron 2025.739290.3__py3-none-any.whl → 2025.739290.4__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mergeron might be problematic. Click here for more details.

@@ -5,15 +5,26 @@ Methods to generate data for analyzing merger enforcement policy.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import io
9
+ import zipfile
10
+ from itertools import starmap
8
11
  from typing import TypedDict
9
12
 
13
+ import h5py # type: ignore
10
14
  import numpy as np
11
15
  from attrs import Attribute, Converter, define, field, validators
12
16
  from joblib import Parallel, cpu_count, delayed # type: ignore
13
17
  from numpy.random import SeedSequence
14
18
  from ruamel import yaml
15
19
 
16
- from .. import NTHREADS, VERSION, RECForm, this_yaml # noqa: TID252 # noqa
20
+ from .. import ( # noqa: TID252 # noqa
21
+ _PKG_NAME,
22
+ NTHREADS,
23
+ VERSION,
24
+ RECForm,
25
+ this_yaml,
26
+ yaml_rt_mapper,
27
+ )
17
28
  from ..core import guidelines_boundaries as gbl # noqa: TID252
18
29
  from ..core.guidelines_boundaries import HMGThresholds # noqa: TID252
19
30
  from . import (
@@ -34,10 +45,12 @@ from .data_generation_functions import (
34
45
  gen_margin_price_data,
35
46
  gen_share_data,
36
47
  )
37
- from .upp_tests import compute_upp_test_counts
48
+ from .upp_tests import compute_upp_test_counts # type: ignore # has pytypes marker ...
38
49
 
39
50
  __version__ = VERSION
40
51
 
52
+ H5_CHUNK_SIZE = 10**6
53
+
41
54
 
42
55
  class SamplingFunctionKWArgs(TypedDict, total=False):
43
56
  "Keyword arguments of sampling methods defined below"
@@ -56,22 +69,15 @@ def _seed_data_conv(_v: SeedSequenceData | None, _i: MarketSample) -> SeedSequen
56
69
  if isinstance(_v, SeedSequenceData):
57
70
  return _v
58
71
 
59
- _mktshr_dist_type = _i.share_spec.dist_type
60
- _price_spec = _i.price_spec
61
-
62
- _seed_count = 2 if _mktshr_dist_type == SHRDistribution.UNI else 3
63
- _seed_count += 1 if _price_spec == PriceSpec.ZERO else 0
64
-
65
- _sseq_list = tuple(SeedSequence(pool_size=8) for _ in range(_seed_count))
66
-
67
- _mktshr_rng_seed_seq, _pcm_rng_seed_seq = _sseq_list[:2]
68
- _fcount_rng_seed_seq = (
69
- None if _mktshr_dist_type == SHRDistribution.UNI else _sseq_list[2]
70
- )
71
- _pr_rng_seed_seq = _sseq_list[-1] if _price_spec == PriceSpec.ZERO else None
72
+ _sseq = tuple(SeedSequence(pool_size=8) for _ in range(4))
73
+ _sdtt = _i.share_spec.dist_type == SHRDistribution.UNI
74
+ _pst = _i.price_spec == PriceSpec.RNG
72
75
 
73
76
  return SeedSequenceData(
74
- _mktshr_rng_seed_seq, _pcm_rng_seed_seq, _fcount_rng_seed_seq, _pr_rng_seed_seq
77
+ share=_sseq[0],
78
+ pcm=_sseq[1],
79
+ fcounts=(None if _sdtt else _sseq[2]),
80
+ price=(None if not _pst else (_sseq[2] if _sdtt else _sseq[3])),
75
81
  )
76
82
 
77
83
 
@@ -141,7 +147,7 @@ class MarketSample:
141
147
  "Set seed_data.fcounts to None and retry."
142
148
  )
143
149
 
144
- if _i.price_spec != PriceSpec.ZERO and _v.price is not None:
150
+ if _i.price_spec != PriceSpec.RNG and _v.price is not None:
145
151
  raise ValueError(
146
152
  "Attribute, seed_data.price is ignored as irrelevant unless "
147
153
  "prices are asymmetric and uncorrelated and price-cost margins "
@@ -151,12 +157,12 @@ class MarketSample:
151
157
  nthreads: int = field(default=NTHREADS, validator=validators.instance_of(int))
152
158
  """number of parallel threads to use"""
153
159
 
154
- data: MarketSampleData | None = field(default=None)
160
+ dataset: MarketSampleData | None = field(default=None, init=False)
155
161
 
156
- enf_counts: UPPTestsCounts | None = field(default=None)
162
+ enf_counts: UPPTestsCounts | None = field(default=None, init=False)
157
163
 
158
164
  def _gen_market_sample(
159
- self, /, *, sample_size: int, seed_data: SeedSequenceData | None, nthreads: int
165
+ self, /, *, sample_size: int, seed_data: SeedSequenceData, nthreads: int
160
166
  ) -> MarketSampleData:
161
167
  """
162
168
  Generate share, diversion ratio, price, and margin data for MarketSpec.
@@ -170,108 +176,81 @@ class MarketSample:
170
176
 
171
177
  """
172
178
 
173
- _recapture_form = self.share_spec.recapture_form
174
- _recapture_ratio = self.share_spec.recapture_ratio
175
- _dist_type_mktshr = self.share_spec.dist_type
176
- _dist_firm2_pcm = self.pcm_spec.firm2_pcm_constraint
177
- _hsr_filing_test_type = self.hsr_filing_test_type
178
-
179
- _seed_data = seed_data or self.seed_data
180
- (
181
- _mktshr_rng_seed_seq,
182
- _pcm_rng_seed_seq,
183
- _fcount_rng_seed_seq,
184
- _pr_rng_seed_seq,
185
- ) = (getattr(_seed_data, _a) for _a in _seed_data.__dataclass_fields__)
186
- _shr_sample_size = 1.0 * (sample_size or self.sample_size)
187
-
188
179
  # Scale up sample size to offset discards based on specified criteria
189
- _shr_sample_size *= _hsr_filing_test_type
190
- if _dist_firm2_pcm == FM2Constraint.MNL:
191
- _shr_sample_size *= SSZConstant.MNL_DEP
192
- _shr_sample_size = int(_shr_sample_size)
180
+ shr_sample_size = sample_size * self.hsr_filing_test_type
181
+ shr_sample_size *= (
182
+ SSZConstant.MNL_DEP
183
+ if self.pcm_spec.firm2_pcm_constraint == FM2Constraint.MNL
184
+ else 1
185
+ )
186
+ shr_sample_size = int(shr_sample_size)
193
187
 
194
188
  # Generate share data
195
- _mktshr_data = gen_share_data(
196
- _shr_sample_size,
189
+ mktshr_data = gen_share_data(
190
+ shr_sample_size,
197
191
  self.share_spec,
198
- _fcount_rng_seed_seq,
199
- _mktshr_rng_seed_seq,
200
- nthreads or self.nthreads,
201
- )
202
-
203
- _mktshr_array, _fcounts, _aggregate_purchase_prob, _nth_firm_share = (
204
- getattr(_mktshr_data, _f)
205
- for _f in (
206
- "mktshr_array",
207
- "fcounts",
208
- "aggregate_purchase_prob",
209
- "nth_firm_share",
210
- )
192
+ seed_data.fcounts,
193
+ seed_data.share,
194
+ nthreads,
211
195
  )
196
+ mktshr_array_ = mktshr_data.mktshr_array
197
+ fcounts_ = mktshr_data.fcounts
198
+ aggregate_purchase_prob_ = mktshr_data.aggregate_purchase_prob
199
+ nth_firm_share_ = mktshr_data.nth_firm_share
200
+ del mktshr_data
212
201
 
213
202
  # Generate merging-firm price and PCM data
214
- _margin_data, _price_data = gen_margin_price_data(
215
- _mktshr_array[:, :2],
216
- _nth_firm_share,
217
- _aggregate_purchase_prob,
203
+ margin_data, price_data = gen_margin_price_data(
204
+ mktshr_array_[:, :2],
205
+ nth_firm_share_,
206
+ aggregate_purchase_prob_,
218
207
  self.pcm_spec,
219
208
  self.price_spec,
220
209
  self.hsr_filing_test_type,
221
- _pcm_rng_seed_seq,
222
- _pr_rng_seed_seq,
210
+ seed_data.pcm,
211
+ seed_data.price,
223
212
  nthreads,
224
213
  )
225
-
226
- _price_array, _hsr_filing_test = (
227
- getattr(_price_data, _f) for _f in ("price_array", "hsr_filing_test")
228
- )
229
-
230
- _pcm_array, _mnl_test_rows = (
231
- getattr(_margin_data, _f) for _f in ("pcm_array", "mnl_test_array")
232
- )
233
-
234
- _mnl_test_rows = _mnl_test_rows * _hsr_filing_test
235
- _s_size = sample_size # originally-specified sample size
236
- if _dist_firm2_pcm == FM2Constraint.MNL:
237
- _mktshr_array = _mktshr_array[_mnl_test_rows][:_s_size]
238
- _pcm_array = _pcm_array[_mnl_test_rows][:_s_size]
239
- _price_array = _price_array[_mnl_test_rows][:_s_size]
240
- _fcounts = _fcounts[_mnl_test_rows][:_s_size]
241
- _aggregate_purchase_prob = _aggregate_purchase_prob[_mnl_test_rows][
242
- :_s_size
214
+ pcm_array_ = margin_data.pcm_array
215
+ price_array_ = price_data.price_array
216
+
217
+ if shr_sample_size > sample_size:
218
+ mnl_test_rows = margin_data.mnl_test_array * price_data.hsr_filing_test
219
+
220
+ mktshr_array_ = mktshr_array_[mnl_test_rows][:sample_size]
221
+ pcm_array_ = margin_data.pcm_array[mnl_test_rows][:sample_size]
222
+ price_array_ = price_data.price_array[mnl_test_rows][:sample_size]
223
+ fcounts_ = fcounts_[mnl_test_rows][:sample_size]
224
+ aggregate_purchase_prob_ = aggregate_purchase_prob_[mnl_test_rows][
225
+ :sample_size
243
226
  ]
244
- _nth_firm_share = _nth_firm_share[_mnl_test_rows][:_s_size]
245
-
246
- # Calculate diversion ratios
247
- _divr_array = gen_divr_array(
248
- _recapture_form,
249
- _recapture_ratio,
250
- _mktshr_array[:, :2],
251
- _aggregate_purchase_prob,
252
- )
253
-
254
- del _mnl_test_rows, _s_size
227
+ nth_firm_share_ = nth_firm_share_[mnl_test_rows][:sample_size]
255
228
 
256
- _frmshr_array = _mktshr_array[:, :2]
257
- _hhi_delta = np.einsum("ij,ij->i", _frmshr_array, _frmshr_array[:, ::-1])[
258
- :, None
259
- ]
229
+ del mnl_test_rows
260
230
 
261
- _hhi_post = (
262
- _hhi_delta + np.einsum("ij,ij->i", _mktshr_array, _mktshr_array)[:, None]
231
+ # Calculate diversion ratios
232
+ divr_array = gen_divr_array(
233
+ self.share_spec.recapture_form,
234
+ self.share_spec.recapture_ratio,
235
+ mktshr_array_[:, :2],
236
+ aggregate_purchase_prob_,
263
237
  )
264
238
 
265
239
  return MarketSampleData(
266
- _frmshr_array,
267
- _pcm_array,
268
- _price_array,
269
- _fcounts,
270
- _aggregate_purchase_prob,
271
- _nth_firm_share,
272
- _divr_array,
273
- _hhi_post,
274
- _hhi_delta,
240
+ mktshr_array_[:, :2],
241
+ pcm_array_,
242
+ price_array_,
243
+ divr_array,
244
+ np.einsum("ij,ij->i", mktshr_array_[:, :2], mktshr_array_[:, [1, 0]])[
245
+ :, None
246
+ ],
247
+ aggregate_purchase_prob_,
248
+ fcounts_,
249
+ nth_firm_share_,
250
+ (
251
+ np.einsum("ij,ij->i", mktshr_array_[:, :2], mktshr_array_[:, [1, 0]])
252
+ + np.einsum("ij,ij->i", mktshr_array_, mktshr_array_)
253
+ )[:, None],
275
254
  )
276
255
 
277
256
  def generate_sample(self, /) -> None:
@@ -283,7 +262,7 @@ class MarketSample:
283
262
 
284
263
  """
285
264
 
286
- self.data = self._gen_market_sample(
265
+ self.dataset = self._gen_market_sample(
287
266
  seed_data=self.seed_data,
288
267
  sample_size=self.sample_size,
289
268
  nthreads=self.nthreads,
@@ -328,21 +307,15 @@ class MarketSample:
328
307
 
329
308
  """
330
309
 
331
- _market_data_sample = self._gen_market_sample(
310
+ market_data_sample = self._gen_market_sample(
332
311
  sample_size=sample_size, seed_data=seed_data, nthreads=nthreads
333
312
  )
334
313
 
335
- _invalid_array_names = (
336
- ("fcounts", "choice_prob_outgd", "nth_firm_share", "hhi_post")
337
- if self.share_spec.dist_type == "Uniform"
338
- else ()
339
- )
340
-
341
- _upp_test_arrays = compute_upp_test_counts(
342
- _market_data_sample, _upp_test_parms, _sim_test_regime
314
+ upp_test_arrays: UPPTestsCounts = compute_upp_test_counts(
315
+ market_data_sample, _upp_test_parms, _sim_test_regime
343
316
  )
344
317
 
345
- return _upp_test_arrays
318
+ return upp_test_arrays
346
319
 
347
320
  def __sim_enf_cnts_ll(
348
321
  self, _enf_parm_vec: gbl.HMGThresholds, _sim_test_regime: UPPTestRegime, /
@@ -372,12 +345,10 @@ class MarketSample:
372
345
  ΔHHI and concentration zone
373
346
 
374
347
  """
375
- _sample_sz = self.sample_size
376
- _subsample_sz = 10**6
377
- _iter_count = (
378
- int(_sample_sz / _subsample_sz) if _subsample_sz < _sample_sz else 1
379
- )
380
- _thread_count = self.nthreads or cpu_count()
348
+ sample_sz = self.sample_size
349
+ subsample_sz = H5_CHUNK_SIZE
350
+ iter_count = (sample_sz / subsample_sz).__ceil__() # noqa: PLC2801
351
+ thread_count = self.nthreads or cpu_count()
381
352
 
382
353
  if (
383
354
  self.share_spec.recapture_form != RECForm.OUTIN
@@ -391,51 +362,49 @@ class MarketSample:
391
362
  )
392
363
  )
393
364
 
394
- _rng_seed_data = [
395
- SeedSequenceData(*_z)
396
- for _z in (
365
+ rng_seed_data = list(
366
+ starmap(
367
+ SeedSequenceData,
397
368
  zip(
398
369
  *[
399
- _s.spawn(_iter_count) if _s else [None] * _iter_count
370
+ _s.spawn(iter_count) if _s else [None] * iter_count
400
371
  for _s in (
401
- getattr(self.seed_data, _a)
402
- for _a in self.seed_data.__dataclass_fields__
372
+ getattr(self.seed_data, _a.name)
373
+ for _a in self.seed_data.__attrs_attrs__
403
374
  )
404
375
  ],
405
376
  strict=True,
406
- )
377
+ ),
407
378
  )
408
- ]
379
+ )
409
380
 
410
- __sim_enf_cnts_kwargs: SamplingFunctionKWArgs = SamplingFunctionKWArgs({
411
- "sample_size": _subsample_sz,
381
+ sim_enf_cnts_kwargs: SamplingFunctionKWArgs = SamplingFunctionKWArgs({
382
+ "sample_size": subsample_sz,
412
383
  "nthreads": self.nthreads,
413
384
  })
414
385
 
415
- _res_list = Parallel(n_jobs=_thread_count, prefer="threads")(
386
+ res_list = Parallel(n_jobs=thread_count, prefer="threads")(
416
387
  delayed(self.__sim_enf_cnts)(
417
388
  _enf_parm_vec,
418
389
  _sim_test_regime,
419
- **__sim_enf_cnts_kwargs,
390
+ **sim_enf_cnts_kwargs,
420
391
  seed_data=_rng_seed_data_ch,
421
392
  )
422
- for _iter_id, _rng_seed_data_ch in enumerate(_rng_seed_data)
393
+ for _iter_id, _rng_seed_data_ch in enumerate(rng_seed_data)
423
394
  )
424
395
 
425
- _res_list_stacks = UPPTestsCounts(*[
426
- np.stack([getattr(_j, _k) for _j in _res_list])
396
+ res_list_stacks = UPPTestsCounts(*[
397
+ np.stack([getattr(_j, _k) for _j in res_list])
427
398
  for _k in ("by_firm_count", "by_delta", "by_conczone")
428
399
  ])
429
400
  upp_test_results = UPPTestsCounts(*[
430
401
  np.column_stack((
431
- (_gv := getattr(_res_list_stacks, _g))[0, :, :_h],
402
+ (_gv := getattr(res_list_stacks, _g.name))[0, :, :_h],
432
403
  np.einsum("ijk->jk", _gv[:, :, _h:], dtype=np.int64),
433
404
  ))
434
- for _g, _h in zip(
435
- _res_list_stacks.__dataclass_fields__, [1, 1, 3], strict=True
436
- )
405
+ for _g, _h in zip(res_list_stacks.__attrs_attrs__, [1, 1, 3], strict=True)
437
406
  ])
438
- del _res_list, _res_list_stacks
407
+ del res_list, res_list_stacks
439
408
 
440
409
  return upp_test_results
441
410
 
@@ -462,29 +431,94 @@ class MarketSample:
462
431
 
463
432
  """
464
433
 
465
- if self.data is None:
434
+ if self.dataset is None:
466
435
  self.enf_counts = self.__sim_enf_cnts_ll(_enf_parm_vec, _upp_test_regime)
467
436
  else:
468
437
  self.enf_counts = compute_upp_test_counts(
469
- self.data, _enf_parm_vec, _upp_test_regime
438
+ self.dataset, _enf_parm_vec, _upp_test_regime
470
439
  )
471
440
 
441
+ def to_archive(
442
+ self, zip_: zipfile.ZipFile, _subdir: str = "", /, *, save_dataset: bool = False
443
+ ) -> None:
444
+ zpath = zipfile.Path(zip_, at=_subdir)
445
+ name_root = f"{_PKG_NAME}_market_sample"
446
+
447
+ with (zpath / f"{name_root}.yaml").open("w") as _yfh:
448
+ this_yaml.dump(self, _yfh)
449
+
450
+ if save_dataset:
451
+ if all((_dt := self.dataset is None, _et := self.enf_counts is None)):
452
+ raise ValueError(
453
+ "No dataset and/or enforcement counts available for saving. "
454
+ "Generate some data or set save_dataset to False to poceed."
455
+ )
456
+
457
+ if not _dt:
458
+ byte_stream = io.BytesIO()
459
+ with h5py.File(byte_stream, "w") as h5f:
460
+ for _a in self.dataset.__attrs_attrs__:
461
+ if all((
462
+ (_arr := getattr(self.dataset, _a.name)).any(),
463
+ not np.isnan(_arr).all(),
464
+ )):
465
+ h5f.create_dataset(_a.name, data=_arr, fletcher32=True)
466
+
467
+ with (zpath / f"{name_root}_dataset.h5").open("wb") as _hfh:
468
+ _hfh.write(byte_stream.getvalue())
469
+
470
+ if not _et:
471
+ with (zpath / f"{name_root}_enf_counts.yaml").open("w") as _yfh:
472
+ this_yaml.dump(self.enf_counts, _yfh)
473
+
474
+ def from_archive(
475
+ zip_: zipfile.ZipFile, _subdir: str = "", /, *, restore_dataset: bool = False
476
+ ) -> MarketSample:
477
+ zpath = zipfile.Path(zip_, at=_subdir)
478
+ name_root = f"{_PKG_NAME}_market_sample"
479
+
480
+ market_sample_ = this_yaml.load((zpath / f"{name_root}.yaml").read_text())
481
+
482
+ if restore_dataset:
483
+ if not any((
484
+ (_dt := (_dp := zpath / f"{name_root}_dataset.h5").is_file()),
485
+ (_et := (_ep := zpath / f"{name_root}_enf_counts.yaml").is_file()),
486
+ )):
487
+ raise ValueError(
488
+ "Archive has no sample data to restore. "
489
+ "Delete second argument, or set it False, and rerun."
490
+ )
491
+
492
+ if _dt:
493
+ with _dp.open("rb") as _hfh:
494
+ h5f = h5py.File(_hfh)
495
+ object.__setattr__( # noqa: PLC2801
496
+ market_sample_,
497
+ "dataset",
498
+ MarketSampleData(**{_a: h5f[_a][:] for _a in h5f}),
499
+ )
500
+ if _et:
501
+ object.__setattr__( # noqa: PLC2801
502
+ market_sample_, "enf_counts", this_yaml.load(_ep.read_text())
503
+ )
504
+ return market_sample_
505
+
472
506
  @classmethod
473
507
  def to_yaml(
474
- cls, _r: yaml.representer.SafeRepresenter, _d: MarketSample
508
+ cls, _r: yaml.representer.RoundTripRepresenter, _d: MarketSample
475
509
  ) -> yaml.MappingNode:
476
- _ret: yaml.MappingNode = _r.represent_mapping(
510
+ retval: yaml.MappingNode = _r.represent_mapping(
477
511
  f"!{cls.__name__}",
478
512
  {
479
513
  _a.name: getattr(_d, _a.name)
480
514
  for _a in _d.__attrs_attrs__
481
- if _a.name not in ("data", "enf_counts")
515
+ if _a.name not in {"dataset", "enf_counts"}
482
516
  },
483
517
  )
484
- return _ret
518
+ return retval
485
519
 
486
520
  @classmethod
487
521
  def from_yaml(
488
- cls, _c: yaml.constructor.SafeConstructor, _n: yaml.MappingNode
522
+ cls, _c: yaml.constructor.RoundTripConstructor, _n: yaml.MappingNode
489
523
  ) -> MarketSample:
490
- return cls(**_c.construct_mapping(_n))
524
+ return cls(**yaml_rt_mapper(_c, _n))