mergeron 2025.739290.3__py3-none-any.whl → 2025.739290.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mergeron might be problematic. Click here for more details.

@@ -5,6 +5,8 @@ Methods to generate data for analyzing merger enforcement policy.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import zipfile
9
+ from itertools import starmap
8
10
  from typing import TypedDict
9
11
 
10
12
  import numpy as np
@@ -13,7 +15,14 @@ from joblib import Parallel, cpu_count, delayed # type: ignore
13
15
  from numpy.random import SeedSequence
14
16
  from ruamel import yaml
15
17
 
16
- from .. import NTHREADS, VERSION, RECForm, this_yaml # noqa: TID252 # noqa
18
+ from .. import ( # noqa: TID252 # noqa
19
+ _PKG_NAME,
20
+ NTHREADS,
21
+ VERSION,
22
+ RECForm,
23
+ this_yaml,
24
+ yaml_rt_mapper,
25
+ )
17
26
  from ..core import guidelines_boundaries as gbl # noqa: TID252
18
27
  from ..core.guidelines_boundaries import HMGThresholds # noqa: TID252
19
28
  from . import (
@@ -34,10 +43,12 @@ from .data_generation_functions import (
34
43
  gen_margin_price_data,
35
44
  gen_share_data,
36
45
  )
37
- from .upp_tests import compute_upp_test_counts
46
+ from .upp_tests import compute_upp_test_counts # type: ignore # has pytypes marker ...
38
47
 
39
48
  __version__ = VERSION
40
49
 
50
+ H5_CHUNK_SIZE = 10**6
51
+
41
52
 
42
53
  class SamplingFunctionKWArgs(TypedDict, total=False):
43
54
  "Keyword arguments of sampling methods defined below"
@@ -56,22 +67,15 @@ def _seed_data_conv(_v: SeedSequenceData | None, _i: MarketSample) -> SeedSequen
56
67
  if isinstance(_v, SeedSequenceData):
57
68
  return _v
58
69
 
59
- _mktshr_dist_type = _i.share_spec.dist_type
60
- _price_spec = _i.price_spec
61
-
62
- _seed_count = 2 if _mktshr_dist_type == SHRDistribution.UNI else 3
63
- _seed_count += 1 if _price_spec == PriceSpec.ZERO else 0
64
-
65
- _sseq_list = tuple(SeedSequence(pool_size=8) for _ in range(_seed_count))
66
-
67
- _mktshr_rng_seed_seq, _pcm_rng_seed_seq = _sseq_list[:2]
68
- _fcount_rng_seed_seq = (
69
- None if _mktshr_dist_type == SHRDistribution.UNI else _sseq_list[2]
70
- )
71
- _pr_rng_seed_seq = _sseq_list[-1] if _price_spec == PriceSpec.ZERO else None
70
+ _sseq = tuple(SeedSequence(pool_size=8) for _ in range(4))
71
+ _sdtt = _i.share_spec.dist_type == SHRDistribution.UNI
72
+ _pst = _i.price_spec == PriceSpec.RNG
72
73
 
73
74
  return SeedSequenceData(
74
- _mktshr_rng_seed_seq, _pcm_rng_seed_seq, _fcount_rng_seed_seq, _pr_rng_seed_seq
75
+ share=_sseq[0],
76
+ pcm=_sseq[1],
77
+ fcounts=(None if _sdtt else _sseq[2]),
78
+ price=(None if not _pst else (_sseq[2] if _sdtt else _sseq[3])),
75
79
  )
76
80
 
77
81
 
@@ -141,7 +145,7 @@ class MarketSample:
141
145
  "Set seed_data.fcounts to None and retry."
142
146
  )
143
147
 
144
- if _i.price_spec != PriceSpec.ZERO and _v.price is not None:
148
+ if _i.price_spec != PriceSpec.RNG and _v.price is not None:
145
149
  raise ValueError(
146
150
  "Attribute, seed_data.price is ignored as irrelevant unless "
147
151
  "prices are asymmetric and uncorrelated and price-cost margins "
@@ -151,12 +155,12 @@ class MarketSample:
151
155
  nthreads: int = field(default=NTHREADS, validator=validators.instance_of(int))
152
156
  """number of parallel threads to use"""
153
157
 
154
- data: MarketSampleData | None = field(default=None)
158
+ dataset: MarketSampleData | None = field(default=None, init=False)
155
159
 
156
- enf_counts: UPPTestsCounts | None = field(default=None)
160
+ enf_counts: UPPTestsCounts | None = field(default=None, init=False)
157
161
 
158
162
  def _gen_market_sample(
159
- self, /, *, sample_size: int, seed_data: SeedSequenceData | None, nthreads: int
163
+ self, /, *, sample_size: int, seed_data: SeedSequenceData, nthreads: int
160
164
  ) -> MarketSampleData:
161
165
  """
162
166
  Generate share, diversion ratio, price, and margin data for MarketSpec.
@@ -170,108 +174,81 @@ class MarketSample:
170
174
 
171
175
  """
172
176
 
173
- _recapture_form = self.share_spec.recapture_form
174
- _recapture_ratio = self.share_spec.recapture_ratio
175
- _dist_type_mktshr = self.share_spec.dist_type
176
- _dist_firm2_pcm = self.pcm_spec.firm2_pcm_constraint
177
- _hsr_filing_test_type = self.hsr_filing_test_type
178
-
179
- _seed_data = seed_data or self.seed_data
180
- (
181
- _mktshr_rng_seed_seq,
182
- _pcm_rng_seed_seq,
183
- _fcount_rng_seed_seq,
184
- _pr_rng_seed_seq,
185
- ) = (getattr(_seed_data, _a) for _a in _seed_data.__dataclass_fields__)
186
- _shr_sample_size = 1.0 * (sample_size or self.sample_size)
187
-
188
177
  # Scale up sample size to offset discards based on specified criteria
189
- _shr_sample_size *= _hsr_filing_test_type
190
- if _dist_firm2_pcm == FM2Constraint.MNL:
191
- _shr_sample_size *= SSZConstant.MNL_DEP
192
- _shr_sample_size = int(_shr_sample_size)
178
+ shr_sample_size = sample_size * self.hsr_filing_test_type
179
+ shr_sample_size *= (
180
+ SSZConstant.MNL_DEP
181
+ if self.pcm_spec.firm2_pcm_constraint == FM2Constraint.MNL
182
+ else 1
183
+ )
184
+ shr_sample_size = int(shr_sample_size)
193
185
 
194
186
  # Generate share data
195
- _mktshr_data = gen_share_data(
196
- _shr_sample_size,
187
+ mktshr_data = gen_share_data(
188
+ shr_sample_size,
197
189
  self.share_spec,
198
- _fcount_rng_seed_seq,
199
- _mktshr_rng_seed_seq,
200
- nthreads or self.nthreads,
201
- )
202
-
203
- _mktshr_array, _fcounts, _aggregate_purchase_prob, _nth_firm_share = (
204
- getattr(_mktshr_data, _f)
205
- for _f in (
206
- "mktshr_array",
207
- "fcounts",
208
- "aggregate_purchase_prob",
209
- "nth_firm_share",
210
- )
190
+ seed_data.fcounts,
191
+ seed_data.share,
192
+ nthreads,
211
193
  )
194
+ mktshr_array_ = mktshr_data.mktshr_array
195
+ fcounts_ = mktshr_data.fcounts
196
+ aggregate_purchase_prob_ = mktshr_data.aggregate_purchase_prob
197
+ nth_firm_share_ = mktshr_data.nth_firm_share
198
+ del mktshr_data
212
199
 
213
200
  # Generate merging-firm price and PCM data
214
- _margin_data, _price_data = gen_margin_price_data(
215
- _mktshr_array[:, :2],
216
- _nth_firm_share,
217
- _aggregate_purchase_prob,
201
+ margin_data, price_data = gen_margin_price_data(
202
+ mktshr_array_[:, :2],
203
+ nth_firm_share_,
204
+ aggregate_purchase_prob_,
218
205
  self.pcm_spec,
219
206
  self.price_spec,
220
207
  self.hsr_filing_test_type,
221
- _pcm_rng_seed_seq,
222
- _pr_rng_seed_seq,
208
+ seed_data.pcm,
209
+ seed_data.price,
223
210
  nthreads,
224
211
  )
225
-
226
- _price_array, _hsr_filing_test = (
227
- getattr(_price_data, _f) for _f in ("price_array", "hsr_filing_test")
228
- )
229
-
230
- _pcm_array, _mnl_test_rows = (
231
- getattr(_margin_data, _f) for _f in ("pcm_array", "mnl_test_array")
232
- )
233
-
234
- _mnl_test_rows = _mnl_test_rows * _hsr_filing_test
235
- _s_size = sample_size # originally-specified sample size
236
- if _dist_firm2_pcm == FM2Constraint.MNL:
237
- _mktshr_array = _mktshr_array[_mnl_test_rows][:_s_size]
238
- _pcm_array = _pcm_array[_mnl_test_rows][:_s_size]
239
- _price_array = _price_array[_mnl_test_rows][:_s_size]
240
- _fcounts = _fcounts[_mnl_test_rows][:_s_size]
241
- _aggregate_purchase_prob = _aggregate_purchase_prob[_mnl_test_rows][
242
- :_s_size
212
+ pcm_array_ = margin_data.pcm_array
213
+ price_array_ = price_data.price_array
214
+
215
+ if shr_sample_size > sample_size:
216
+ mnl_test_rows = margin_data.mnl_test_array * price_data.hsr_filing_test
217
+
218
+ mktshr_array_ = mktshr_array_[mnl_test_rows][:sample_size]
219
+ pcm_array_ = margin_data.pcm_array[mnl_test_rows][:sample_size]
220
+ price_array_ = price_data.price_array[mnl_test_rows][:sample_size]
221
+ fcounts_ = fcounts_[mnl_test_rows][:sample_size]
222
+ aggregate_purchase_prob_ = aggregate_purchase_prob_[mnl_test_rows][
223
+ :sample_size
243
224
  ]
244
- _nth_firm_share = _nth_firm_share[_mnl_test_rows][:_s_size]
245
-
246
- # Calculate diversion ratios
247
- _divr_array = gen_divr_array(
248
- _recapture_form,
249
- _recapture_ratio,
250
- _mktshr_array[:, :2],
251
- _aggregate_purchase_prob,
252
- )
253
-
254
- del _mnl_test_rows, _s_size
225
+ nth_firm_share_ = nth_firm_share_[mnl_test_rows][:sample_size]
255
226
 
256
- _frmshr_array = _mktshr_array[:, :2]
257
- _hhi_delta = np.einsum("ij,ij->i", _frmshr_array, _frmshr_array[:, ::-1])[
258
- :, None
259
- ]
227
+ del mnl_test_rows
260
228
 
261
- _hhi_post = (
262
- _hhi_delta + np.einsum("ij,ij->i", _mktshr_array, _mktshr_array)[:, None]
229
+ # Calculate diversion ratios
230
+ divr_array = gen_divr_array(
231
+ self.share_spec.recapture_form,
232
+ self.share_spec.recapture_ratio,
233
+ mktshr_array_[:, :2],
234
+ aggregate_purchase_prob_,
263
235
  )
264
236
 
265
237
  return MarketSampleData(
266
- _frmshr_array,
267
- _pcm_array,
268
- _price_array,
269
- _fcounts,
270
- _aggregate_purchase_prob,
271
- _nth_firm_share,
272
- _divr_array,
273
- _hhi_post,
274
- _hhi_delta,
238
+ mktshr_array_[:, :2],
239
+ pcm_array_,
240
+ price_array_,
241
+ divr_array,
242
+ np.einsum("ij,ij->i", mktshr_array_[:, :2], mktshr_array_[:, [1, 0]])[
243
+ :, None
244
+ ],
245
+ aggregate_purchase_prob_,
246
+ fcounts_,
247
+ nth_firm_share_,
248
+ (
249
+ np.einsum("ij,ij->i", mktshr_array_[:, :2], mktshr_array_[:, [1, 0]])
250
+ + np.einsum("ij,ij->i", mktshr_array_, mktshr_array_)
251
+ )[:, None],
275
252
  )
276
253
 
277
254
  def generate_sample(self, /) -> None:
@@ -283,7 +260,7 @@ class MarketSample:
283
260
 
284
261
  """
285
262
 
286
- self.data = self._gen_market_sample(
263
+ self.dataset = self._gen_market_sample(
287
264
  seed_data=self.seed_data,
288
265
  sample_size=self.sample_size,
289
266
  nthreads=self.nthreads,
@@ -328,21 +305,15 @@ class MarketSample:
328
305
 
329
306
  """
330
307
 
331
- _market_data_sample = self._gen_market_sample(
308
+ market_data_sample = self._gen_market_sample(
332
309
  sample_size=sample_size, seed_data=seed_data, nthreads=nthreads
333
310
  )
334
311
 
335
- _invalid_array_names = (
336
- ("fcounts", "choice_prob_outgd", "nth_firm_share", "hhi_post")
337
- if self.share_spec.dist_type == "Uniform"
338
- else ()
339
- )
340
-
341
- _upp_test_arrays = compute_upp_test_counts(
342
- _market_data_sample, _upp_test_parms, _sim_test_regime
312
+ upp_test_arrays: UPPTestsCounts = compute_upp_test_counts(
313
+ market_data_sample, _upp_test_parms, _sim_test_regime
343
314
  )
344
315
 
345
- return _upp_test_arrays
316
+ return upp_test_arrays
346
317
 
347
318
  def __sim_enf_cnts_ll(
348
319
  self, _enf_parm_vec: gbl.HMGThresholds, _sim_test_regime: UPPTestRegime, /
@@ -372,12 +343,10 @@ class MarketSample:
372
343
  ΔHHI and concentration zone
373
344
 
374
345
  """
375
- _sample_sz = self.sample_size
376
- _subsample_sz = 10**6
377
- _iter_count = (
378
- int(_sample_sz / _subsample_sz) if _subsample_sz < _sample_sz else 1
379
- )
380
- _thread_count = self.nthreads or cpu_count()
346
+ sample_sz = self.sample_size
347
+ subsample_sz = H5_CHUNK_SIZE
348
+ iter_count = (sample_sz / subsample_sz).__ceil__() # noqa: PLC2801
349
+ thread_count = self.nthreads or cpu_count()
381
350
 
382
351
  if (
383
352
  self.share_spec.recapture_form != RECForm.OUTIN
@@ -391,51 +360,49 @@ class MarketSample:
391
360
  )
392
361
  )
393
362
 
394
- _rng_seed_data = [
395
- SeedSequenceData(*_z)
396
- for _z in (
363
+ rng_seed_data = list(
364
+ starmap(
365
+ SeedSequenceData,
397
366
  zip(
398
367
  *[
399
- _s.spawn(_iter_count) if _s else [None] * _iter_count
368
+ _s.spawn(iter_count) if _s else [None] * iter_count
400
369
  for _s in (
401
- getattr(self.seed_data, _a)
402
- for _a in self.seed_data.__dataclass_fields__
370
+ getattr(self.seed_data, _a.name)
371
+ for _a in self.seed_data.__attrs_attrs__
403
372
  )
404
373
  ],
405
374
  strict=True,
406
- )
375
+ ),
407
376
  )
408
- ]
377
+ )
409
378
 
410
- __sim_enf_cnts_kwargs: SamplingFunctionKWArgs = SamplingFunctionKWArgs({
411
- "sample_size": _subsample_sz,
379
+ sim_enf_cnts_kwargs: SamplingFunctionKWArgs = SamplingFunctionKWArgs({
380
+ "sample_size": subsample_sz,
412
381
  "nthreads": self.nthreads,
413
382
  })
414
383
 
415
- _res_list = Parallel(n_jobs=_thread_count, prefer="threads")(
384
+ res_list = Parallel(n_jobs=thread_count, prefer="threads")(
416
385
  delayed(self.__sim_enf_cnts)(
417
386
  _enf_parm_vec,
418
387
  _sim_test_regime,
419
- **__sim_enf_cnts_kwargs,
388
+ **sim_enf_cnts_kwargs,
420
389
  seed_data=_rng_seed_data_ch,
421
390
  )
422
- for _iter_id, _rng_seed_data_ch in enumerate(_rng_seed_data)
391
+ for _iter_id, _rng_seed_data_ch in enumerate(rng_seed_data)
423
392
  )
424
393
 
425
- _res_list_stacks = UPPTestsCounts(*[
426
- np.stack([getattr(_j, _k) for _j in _res_list])
394
+ res_list_stacks = UPPTestsCounts(*[
395
+ np.stack([getattr(_j, _k) for _j in res_list])
427
396
  for _k in ("by_firm_count", "by_delta", "by_conczone")
428
397
  ])
429
398
  upp_test_results = UPPTestsCounts(*[
430
399
  np.column_stack((
431
- (_gv := getattr(_res_list_stacks, _g))[0, :, :_h],
400
+ (_gv := getattr(res_list_stacks, _g.name))[0, :, :_h],
432
401
  np.einsum("ijk->jk", _gv[:, :, _h:], dtype=np.int64),
433
402
  ))
434
- for _g, _h in zip(
435
- _res_list_stacks.__dataclass_fields__, [1, 1, 3], strict=True
436
- )
403
+ for _g, _h in zip(res_list_stacks.__attrs_attrs__, [1, 1, 3], strict=True)
437
404
  ])
438
- del _res_list, _res_list_stacks
405
+ del res_list, res_list_stacks
439
406
 
440
407
  return upp_test_results
441
408
 
@@ -462,29 +429,94 @@ class MarketSample:
462
429
 
463
430
  """
464
431
 
465
- if self.data is None:
432
+ if self.dataset is None:
466
433
  self.enf_counts = self.__sim_enf_cnts_ll(_enf_parm_vec, _upp_test_regime)
467
434
  else:
468
435
  self.enf_counts = compute_upp_test_counts(
469
- self.data, _enf_parm_vec, _upp_test_regime
436
+ self.dataset, _enf_parm_vec, _upp_test_regime
470
437
  )
471
438
 
439
+ def to_archive(
440
+ self, zip_: zipfile.ZipFile, _subdir: str = "", /, *, save_dataset: bool = False
441
+ ) -> None:
442
+ zpath = zipfile.Path(zip_, at=_subdir)
443
+ name_root = f"{_PKG_NAME}_market_sample"
444
+
445
+ with (zpath / f"{name_root}.yaml").open("w") as _yfh:
446
+ this_yaml.dump(self, _yfh)
447
+
448
+ if save_dataset:
449
+ if all((_ndt := self.dataset is None, _net := self.enf_counts is None)):
450
+ raise ValueError(
451
+ "No dataset and/or enforcement counts available for saving. "
452
+ "Generate some data or set save_dataset to False to poceed."
453
+ )
454
+
455
+ if not _ndt:
456
+ # byte_stream = io.BytesIO()
457
+ # with h5py.File(byte_stream, "w") as h5f:
458
+ # for _a in self.dataset.__attrs_attrs__:
459
+ # if all((
460
+ # (_arr := getattr(self.dataset, _a.name)).any(),
461
+ # not np.isnan(_arr).all(),
462
+ # )):
463
+ # h5f.create_dataset(_a.name, data=_arr, fletcher32=True)
464
+
465
+ with (zpath / f"{name_root}_dataset.h5").open("wb") as _hfh:
466
+ _hfh.write(self.dataset.to_h5bin())
467
+
468
+ if not _net:
469
+ with (zpath / f"{name_root}_enf_counts.yaml").open("w") as _yfh:
470
+ this_yaml.dump(self.enf_counts, _yfh)
471
+
472
+ def from_archive(
473
+ zip_: zipfile.ZipFile, _subdir: str = "", /, *, restore_dataset: bool = False
474
+ ) -> MarketSample:
475
+ zpath = zipfile.Path(zip_, at=_subdir)
476
+ name_root = f"{_PKG_NAME}_market_sample"
477
+
478
+ market_sample_ = this_yaml.load((zpath / f"{name_root}.yaml").read_text())
479
+
480
+ if restore_dataset:
481
+ if not any((
482
+ (_dt := (_dp := zpath / f"{name_root}_dataset.h5").is_file()),
483
+ (_et := (_ep := zpath / f"{name_root}_enf_counts.yaml").is_file()),
484
+ )):
485
+ raise ValueError(
486
+ "Archive has no sample data to restore. "
487
+ "Delete second argument, or set it False, and rerun."
488
+ )
489
+
490
+ if _dt:
491
+ with _dp.open("rb") as _hfh:
492
+ object.__setattr__( # noqa: PLC2801
493
+ market_sample_,
494
+ "dataset",
495
+ # MarketSampleData(**{_a: h5f[_a][:] for _a in h5f}),
496
+ MarketSampleData.from_h5f(_hfh),
497
+ )
498
+ if _et:
499
+ object.__setattr__( # noqa: PLC2801
500
+ market_sample_, "enf_counts", this_yaml.load(_ep.read_text())
501
+ )
502
+ return market_sample_
503
+
472
504
  @classmethod
473
505
  def to_yaml(
474
- cls, _r: yaml.representer.SafeRepresenter, _d: MarketSample
506
+ cls, _r: yaml.representer.RoundTripRepresenter, _d: MarketSample
475
507
  ) -> yaml.MappingNode:
476
- _ret: yaml.MappingNode = _r.represent_mapping(
508
+ retval: yaml.MappingNode = _r.represent_mapping(
477
509
  f"!{cls.__name__}",
478
510
  {
479
511
  _a.name: getattr(_d, _a.name)
480
512
  for _a in _d.__attrs_attrs__
481
- if _a.name not in ("data", "enf_counts")
513
+ if _a.name not in {"dataset", "enf_counts"}
482
514
  },
483
515
  )
484
- return _ret
516
+ return retval
485
517
 
486
518
  @classmethod
487
519
  def from_yaml(
488
- cls, _c: yaml.constructor.SafeConstructor, _n: yaml.MappingNode
520
+ cls, _c: yaml.constructor.RoundTripConstructor, _n: yaml.MappingNode
489
521
  ) -> MarketSample:
490
- return cls(**_c.construct_mapping(_n))
522
+ return cls(**yaml_rt_mapper(_c, _n))