mergeron 2025.739290.2-py3-none-any.whl → 2025.739290.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mergeron might be problematic.
- mergeron/__init__.py +74 -48
- mergeron/core/__init__.py +105 -4
- mergeron/core/empirical_margin_distribution.py +100 -78
- mergeron/core/ftc_merger_investigations_data.py +309 -316
- mergeron/core/guidelines_boundaries.py +62 -121
- mergeron/core/guidelines_boundary_functions.py +207 -384
- mergeron/core/guidelines_boundary_functions_extra.py +264 -104
- mergeron/core/pseudorandom_numbers.py +76 -67
- mergeron/data/damodaran_margin_data_serialized.zip +0 -0
- mergeron/data/ftc_invdata.zip +0 -0
- mergeron/demo/visualize_empirical_margin_distribution.py +9 -7
- mergeron/gen/__init__.py +123 -161
- mergeron/gen/data_generation.py +183 -149
- mergeron/gen/data_generation_functions.py +220 -237
- mergeron/gen/enforcement_stats.py +83 -115
- mergeron/gen/upp_tests.py +118 -193
- {mergeron-2025.739290.2.dist-info → mergeron-2025.739290.4.dist-info}/METADATA +2 -3
- mergeron-2025.739290.4.dist-info/RECORD +24 -0
- {mergeron-2025.739290.2.dist-info → mergeron-2025.739290.4.dist-info}/WHEEL +1 -1
- mergeron/data/damodaran_margin_data_dict.msgpack +0 -0
- mergeron-2025.739290.2.dist-info/RECORD +0 -23
@@ -10,25 +10,37 @@ Reported row and column totals from source data are not stored.
 
 from __future__ import annotations
 
+import re
 import shutil
-from collections.abc import
-from dataclasses import dataclass
+from collections.abc import Sequence
 from importlib import resources
 from operator import itemgetter
 from pathlib import Path
 from types import MappingProxyType
 from typing import Any
+from zipfile import ZIP_DEFLATED, ZipFile
 
-import msgpack  # type: ignore
 import msgpack_numpy as m  # type: ignore
 import numpy as np
-import re2 as re  # type: ignore
 import urllib3
 from bs4 import BeautifulSoup
 from numpy.testing import assert_array_equal
-from ruamel import yaml
 
-from .. import
+from .. import (  # noqa: TID252
+    _PKG_NAME,
+    DATA_DIR,
+    EMPTY_ARRAYINT,
+    VERSION,
+    ArrayBIGINT,
+    this_yaml,
+)
+from . import (
+    INVData,
+    INVData_in_,
+    INVTableData,
+    _dict_from_mapping,
+    _mappingproxy_from_mapping,
+)
 
 __version__ = VERSION
 
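The import changes above encode this release's storage switch: `re2` gives way to the standard-library `re`, the direct `msgpack` and `ruamel.yaml` imports are dropped, and the investigations data now round-trips as a YAML document inside a DEFLATE-compressed zip through the package-level `this_yaml` object. The `INVTableData` dataclass and the `INVData` type aliases removed further down are now imported from the `mergeron.core` package instead (note the +105-line change to `mergeron/core/__init__.py` in the file list). A minimal sketch of the zip-plus-YAML round trip, assuming a plain `ruamel.yaml.YAML` instance as a stand-in for `this_yaml`:

    from io import BytesIO
    from zipfile import ZIP_DEFLATED, ZipFile

    from ruamel.yaml import YAML  # stand-in for mergeron's shared `this_yaml`

    yaml_ = YAML(typ="safe")
    payload = {"1996-2003": {"ByHHIandDelta": {"Table 3.1": [[0, 100, 1, 2, 3]]}}}

    buf = BytesIO()  # in-memory stand-in for INVDATA_ARCHIVE_PATH
    with ZipFile(buf, "w", compression=ZIP_DEFLATED) as zh:
        with zh.open("ftc_invdata.yaml", "w") as fh:  # member named after the archive stem
            yaml_.dump(payload, fh)

    with ZipFile(buf, "r") as zh, zh.open("ftc_invdata.yaml", "r") as fh:
        assert yaml_.load(fh) == payload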
@@ -38,7 +50,7 @@ FTCDATA_DIR = DATA_DIR / "FTCData"
 if not FTCDATA_DIR.is_dir():
     FTCDATA_DIR.mkdir(parents=True)
 
-INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.
+INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.zip"
 if (
     not INVDATA_ARCHIVE_PATH.is_file()
     and (
@@ -93,32 +105,6 @@ CNT_FCOUNT_DICT = {
 }
 
 
-@dataclass(slots=True, frozen=True)
-class INVTableData:
-    industry_group: str
-    additional_evidence: str
-    data_array: ArrayBIGINT
-
-    @classmethod
-    def to_yaml(
-        cls, _r: yaml.representer.SafeRepresenter, _d: INVTableData
-    ) -> yaml.MappingNode:
-        _ret: yaml.MappingNode = _r.represent_mapping(
-            f"!{cls.__name__}", {_a: getattr(_d, _a) for _a in _d.__dataclass_fields__}
-        )
-        return _ret
-
-    @classmethod
-    def from_yaml(
-        cls, _c: yaml.constructor.SafeConstructor, _n: yaml.MappingNode
-    ) -> INVTableData:
-        return cls(**_c.construct_mapping(_n))
-
-
-type INVData = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
-type _INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
-
-
 def construct_data(
     _archive_path: Path = INVDATA_ARCHIVE_PATH,
     *,
@@ -157,75 +143,83 @@ def construct_data(
     """
 
     if _archive_path.is_file() and not rebuild_data:
-
-
-
-
-
-
-
-
-
-
-
-
+        with (
+            ZipFile(_archive_path, "r") as _yzh,
+            _yzh.open(f"{_archive_path.stem}.yaml", "r") as _yfh,
+        ):
+            invdata_ = this_yaml.load(_yfh)
+        if isinstance(invdata_, MappingProxyType):
+            invdata_ = _mappingproxy_from_mapping(invdata_)
+            with (
+                ZipFile(_archive_path, "w", compression=ZIP_DEFLATED) as _yzh,
+                _yzh.open(f"{_archive_path.stem}.yaml", "w") as _yfh,
+            ):
+                this_yaml.dump(invdata_, _yfh)
+        return invdata_
 
-
+    invdata: INVData_in_ = _dict_from_mapping(_parse_invdata())
 
     # Add some data periods (
     # only periods ending in 2011, others have few observations and
     # some incompatibilities
     # )
-    for
-
-
-
+    for data_period in "2004-2011", "2006-2011", "2008-2011":
+        invdata_bld = _construct_new_period_data(
+            invdata,
+            data_period,
             flag_backward_compatibility=flag_backward_compatibility,
         )
-
+        invdata |= {data_period: invdata_bld}
 
     # Create data for industries with no evidence on entry
-    for
-        _construct_no_evidence_data(
+    for data_period in invdata:
+        _construct_no_evidence_data(invdata, data_period)
 
     # Create a list of exclusions to named industries in the base period,
     # for construction of aggregate enforcement statistics where feasible
-
+    industry_exclusion_list = {
         "AllMarkets",
         "OtherMarkets",
         "IndustriesinCommon",
         "",
         ("PharmaceuticalsMarkets" if flag_pharma_for_exclusion else None),
-
-
-
+    }
+
+    # Construct aggregate tables
+    for data_period in "1996-2003", "1996-2011", "2004-2011":
+        for table_type, table_no in zip(
             TABLE_TYPES, (CONC_TABLE_ALL, CNT_TABLE_ALL), strict=True
         ):
-
+            invdata_sub_tabletype = invdata[data_period][table_type]
 
-
-
-            for
+            aggr_tables_list = [
+                t_
+                for t_ in invdata["1996-2003"][table_type]
                 if re.sub(
-                    r"\W", "",
+                    r"\W", "", invdata["1996-2003"][table_type][t_].industry_group
                 )
-                not in
+                not in industry_exclusion_list
             ]
 
-
-
-
+            invdata_sub_tabletype |= {
+                table_no.replace(".1", ".X"): invdata_build_aggregate_table(
+                    invdata_sub_tabletype, aggr_tables_list
                 )
             }
 
-
+    retval: INVData = _mappingproxy_from_mapping(invdata)
+    with (
+        ZipFile(_archive_path, "w", compression=ZIP_DEFLATED) as _yzh,
+        _yzh.open(f"{_archive_path.stem}.yaml", "w") as _yfh,
+    ):
+        this_yaml.dump(retval, _yfh)
 
-    return
+    return retval
 
 
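With the archive handling above, `construct_data` acts as a build-once cache: the first call parses the FTC report PDFs and writes the zipped YAML archive, and later calls reload it directly. A hypothetical usage sketch; the import path follows the file list at the top of this diff, and the keyword flags are the ones appearing in this function:

    from mergeron.core import ftc_merger_investigations_data as fid

    # First run parses the source PDFs (requires pymupdf) and caches the
    # result as DATA_DIR / "ftc_invdata.zip"; later runs reload the archive.
    invdata = fid.construct_data()

    # Force a re-parse, keeping pharmaceuticals out of the aggregate tables.
    invdata = fid.construct_data(rebuild_data=True, flag_pharma_for_exclusion=True)

    table = invdata["1996-2011"]["ByHHIandDelta"]["Table 3.1"]
    print(table.industry_group, table.data_array.shape)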
-def _construct_no_evidence_data(_invdata:
-
-
+def _construct_no_evidence_data(_invdata: INVData_in_, _data_period: str, /) -> None:
+    invdata_ind_grp = "All Markets"
+    table_nos_map = dict(
         zip(
             (
                 "No Entry Evidence",
@@ -240,28 +234,28 @@ def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) ->
             strict=True,
         )
     )
-    for
+    for invdata_evid_cond in (
         "No Entry Evidence",
         "No Evidence on Customer Complaints",
         "No Evidence on Hot Documents",
     ):
-        for
-
+        for stats_grp in ("ByHHIandDelta", "ByFirmCount"):
+            invdata_sub_evid_cond_conc = _invdata[_data_period][stats_grp]
 
-
-
-
+            dtn = table_nos_map[invdata_evid_cond]["ByHHIandDelta"]
+            stn0 = "Table 4.1" if stats_grp == "ByFirmCount" else "Table 3.1"
+            stn1, stn2 = (dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
 
-
-
-
-
+            invdata_sub_evid_cond_conc |= {
+                dtn: INVTableData(
+                    invdata_ind_grp,
+                    invdata_evid_cond,
                     np.column_stack((
-
+                        invdata_sub_evid_cond_conc[stn0].data_array[:, :2],
                         (
-
-                            -
-                            -
+                            invdata_sub_evid_cond_conc[stn0].data_array[:, 2:]
+                            - invdata_sub_evid_cond_conc[stn1].data_array[:, 2:]
+                            - invdata_sub_evid_cond_conc[stn2].data_array[:, 2:]
                         ),
                     )),
                 )
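The block above fills in tables the reports never published directly: for each "No Evidence ..." condition, outcome counts are derived by subtracting the two evidence-present tables (`stn1`, `stn2`) from the unrestricted table (`stn0`), while the leading classification columns are carried over unchanged. A small numeric sketch of that column-stack arithmetic, with made-up counts:

    import numpy as np

    # Hypothetical rows: [HHI key, delta key, enforced, closed, total].
    unrestricted = np.array([[0, 100, 9, 6, 15], [0, 200, 4, 4, 8]], np.int64)
    with_evidence_1 = np.array([[0, 100, 3, 1, 4], [0, 200, 1, 2, 3]], np.int64)
    with_evidence_2 = np.array([[0, 100, 2, 2, 4], [0, 200, 1, 1, 2]], np.int64)

    no_evidence = np.column_stack((
        unrestricted[:, :2],  # keep the classification keys as-is
        unrestricted[:, 2:] - with_evidence_1[:, 2:] - with_evidence_2[:, 2:],
    ))
    print(no_evidence)  # rows: [0 100 4 3 7] and [0 200 2 1 3]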
@@ -275,44 +269,44 @@ def _construct_new_period_data(
     *,
     flag_backward_compatibility: bool = False,
 ) -> dict[str, dict[str, INVTableData]]:
-
-    if
+    cuml_period = f"1996-{_data_period.split('-')[1]}"
+    if cuml_period != "1996-2011":
         raise ValueError('Expected cumulative period, "1996-2011"')
 
-
+    invdata_cuml = _invdata[cuml_period]
 
-
-
+    base_period = "1996-{}".format(int(_data_period.split("-")[0]) - 1)
+    invdata_base = _invdata[base_period]
 
-    if tuple(
+    if tuple(invdata_cuml.keys()) != TABLE_TYPES:
         raise ValueError("Source data does not include the expected groups of tables.")
 
-
-    for
-
-        for
-
-
-
-
-
+    invdata_bld = {}
+    for table_type in TABLE_TYPES:
+        data_typesubdict = {}
+        for table_no in invdata_cuml[table_type]:
+            invdata_cuml_sub_table = invdata_cuml[table_type][table_no]
+            invdata_ind_group, invdata_evid_cond, invdata_cuml_array = (
+                invdata_cuml_sub_table.industry_group,
+                invdata_cuml_sub_table.additional_evidence,
+                invdata_cuml_sub_table.data_array,
             )
 
-
-
+            invdata_base_sub_table = invdata_base[table_type].get(
+                table_no, INVTableData("", "", EMPTY_ARRAYINT)
             )
 
-            (
-            getattr(
+            (invdata_base_ind_group, invdata_base_evid_cond, invdata_base_array) = (
+                getattr(invdata_base_sub_table, _a)
                 for _a in ("industry_group", "additional_evidence", "data_array")
             )
 
             # Some tables can't be constructed due to inconsistencies in the data
             # across time periods
             if (
-                (_data_period != "2004-2011" and
-                or (
-                or (
+                (_data_period != "2004-2011" and invdata_ind_group != "All Markets")
+                or (invdata_ind_group in {'"Other" Markets', "Industries in Common"})
+                or (invdata_base_ind_group in {'"Other" Markets', ""})
             ):
                 continue
 
@@ -328,31 +322,29 @@ def _construct_new_period_data(
                 # The number of "revisions" applied below, for enforcing consistency,
                 # is sufficiently small as to be unlikely to substantially impact
                 # results from analysis of the data.
-
-
-
-                for
-
-
-
-
-
-                        _invdata[_data_period_detail][_table_type][
-                            _table_no
+                invdata_cuml_array_stack = []
+                invdata_base_array_stack = []
+
+                for data_period_detail in _invdata:
+                    pd_start, pd_end = (int(g) for g in data_period_detail.split("-"))
+                    if pd_start == 1996:
+                        invdata_cuml_array_stack += [
+                            _invdata[data_period_detail][table_type][
+                                table_no
                             ].data_array[:, -3:-1]
                         ]
-                    if
-
-                        _invdata[
-
+                    if pd_start == 1996 and pd_end < int(_data_period.split("-")[0]):
+                        invdata_base_array_stack += [
+                            _invdata[data_period_detail][table_type][
+                                table_no
                             ].data_array[:, -3:-1]
                         ]
-
+                invdata_cuml_array_enfcls, invdata_base_array_enfcls = (
                     np.stack(_f).max(axis=0)
-                    for _f in (
+                    for _f in (invdata_cuml_array_stack, invdata_base_array_stack)
                 )
-
-
+                invdata_array_bld_enfcls = (
+                    invdata_cuml_array_enfcls - invdata_base_array_enfcls
                 )
             else:
                 # Consistency here means that the most recent data are considered
@@ -369,59 +361,59 @@ def _construct_new_period_data(
                 # backward compatible due to minor variation in (applying) the criteria
                 # for inclusion, as well as industry coding, undertaken to maintain
                 # transparency on the enforcement process.
-
-
+                invdata_array_bld_enfcls = (
+                    invdata_cuml_array[:, -3:-1] - invdata_base_array[:, -3:-1]
                 )
 
             # # // spellchecker: disable
             # To examine the number of corrected values per table, // spellchecker: disable
             # uncomment the statements below
-            #
-            #
+            # invdata_array_bld_tbc = where(
+            #     invdata_array_bld_enfcls < 0, invdata_array_bld_enfcls, 0
            # )
             # if np.einsum('ij->', invdata_array_bld_tbc):
             #     print(
-            #         f"{_data_period}, {_table_no}, {
+            #         f"{_data_period}, {_table_no}, {invdata_ind_group}:",
             #         abs(np.einsum('ij->', invdata_array_bld_tbc))
             #     )
             # # // spellchecker: disable
 
             # Enforce non-negativity
-
-
-            np.zeros_like(
+            invdata_array_bld_enfcls = np.stack((
+                invdata_array_bld_enfcls,
+                np.zeros_like(invdata_array_bld_enfcls),
             )).max(axis=0)
 
-
-
-
-            np.einsum("ij->i",
+            invdata_array_bld = np.column_stack((
+                invdata_cuml_array[:, :-3],
+                invdata_array_bld_enfcls,
+                np.einsum("ij->i", invdata_array_bld_enfcls),
             ))
 
-
-
+            data_typesubdict[table_no] = INVTableData(
+                invdata_ind_group, invdata_evid_cond, invdata_array_bld
             )
-            del
-            del
-            del
-
-    return
+            del invdata_ind_group, invdata_evid_cond, invdata_cuml_array
+            del invdata_base_ind_group, invdata_base_evid_cond, invdata_base_array
+            del invdata_array_bld
+        invdata_bld[table_type] = data_typesubdict
+    return invdata_bld
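Two numpy idioms above are worth a gloss. Differencing a cumulative period against a base period can leave small negative counts where the published tables disagree, so `_construct_new_period_data` clamps at zero by stacking the difference against a zero array and taking the elementwise maximum, then rebuilds each row's total with an einsum row-sum. A toy sketch, using a signed dtype for clarity (the module's arrays are unsigned):

    import numpy as np

    diff = np.array([[4, -1], [2, 3]], np.int64)  # hypothetical enforced/closed deltas
    clamped = np.stack((diff, np.zeros_like(diff))).max(axis=0)
    print(clamped)                      # rows: [4 0] and [2 3]
    print(np.einsum("ij->i", clamped))  # recomputed row totals: [4 5]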
 
 
-def
+def invdata_build_aggregate_table(
     _data_typesub: dict[str, INVTableData], _aggr_table_list: Sequence[str]
 ) -> INVTableData:
-
+    hdr_table_no = _aggr_table_list[0]
 
     return INVTableData(
         "Industries in Common",
         "Unrestricted on additional evidence",
         np.column_stack((
-            _data_typesub[
+            _data_typesub[hdr_table_no].data_array[:, :-3],
             np.einsum(
                 "ijk->jk",
                 np.stack([
-                    (_data_typesub[
+                    (_data_typesub[t_]).data_array[:, -3:] for t_ in _aggr_table_list
                 ]),
             ),
         )),
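`invdata_build_aggregate_table` sums the outcome columns of all listed tables in one step: stacking the per-table 2-D arrays yields a 3-D array, and `np.einsum("ijk->jk", ...)` contracts away the table axis. An equivalent toy sketch:

    import numpy as np

    # Three hypothetical tables, each 2 rows of [enforced, closed, total].
    tables = np.stack([
        np.array([[3, 1, 4], [1, 2, 3]], np.uint64),
        np.array([[2, 2, 4], [1, 1, 2]], np.uint64),
        np.array([[4, 3, 7], [2, 1, 3]], np.uint64),
    ])
    aggregate = np.einsum("ijk->jk", tables)  # elementwise sum across the tables
    print(aggregate)  # rows: [9 6 15] and [4 4 8]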
@@ -439,294 +431,295 @@ def _parse_invdata() -> INVData:
     by range of HHI and ∆HHI.
 
     """
-    raise ValueError(
-
-
-
-
-
-
-
-    )
-    import
-
-
-
-
-
-    for
-
-
-
-
-        if
-
+    # raise ValueError(
+    #     "This function is defined here as documentation.\n"
+    #     "NOTE: License for `pymupdf`, upon which this function depends,"
+    #     " may be incompatible with the MIT license,"
+    #     " under which this pacakge is distributed."
+    #     " Making this fumction operable requires the user to modify"
+    #     " the source code as well as to install an additional package"
+    #     " not distributed with this package or identified as a requirement."
+    # )
+    import pymupdf  # type: ignore # noqa: PLC0415
+
+    invdata_docnames = _download_invdata(FTCDATA_DIR)
+
+    invdata: INVData_in_ = {}
+
+    for invdata_docname in invdata_docnames:
+        invdata_pdf_path = FTCDATA_DIR.joinpath(invdata_docname)
+
+        invdata_doc = pymupdf.open(invdata_pdf_path)
+        invdata_meta = invdata_doc.metadata
+        if invdata_meta["title"] == " ":
+            invdata_meta["title"] = ", ".join((
                 "Horizontal Merger Investigation Data",
                 "Fiscal Years",
                 "1996-2005",
             ))
 
-
-
+        data_period = "".join(  # line-break here for readability
+            re.findall(r"(\d{4}) *(-) *(\d{4})", invdata_meta["title"])[0]
+        )
 
         # Initialize containers for parsed data
-
+        invdata[data_period] = {k: {} for k in TABLE_TYPES}
 
-        for
-
+        for pdf_pg in invdata_doc.pages():
+            doc_pg_blocks = pdf_pg.get_text("blocks", sort=False)
             # Across all published reports of FTC investigations data,
             # sorting lines (PDF page blocks) by the lower coordinates
             # and then the left coordinates is most effective for
             # ordering table rows in top-to-bottom order; this doesn't
             # work for the 1996-2005 data, however, so we resort later
-
+            doc_pg_blocks = sorted([
                 (f"{_f[3]:03.0f}{_f[0]:03.0f}{_f[1]:03.0f}{_f[2]:03.0f}", *_f)
-                for _f in
+                for _f in doc_pg_blocks
                 if _f[-1] == 0
             ])
 
-
+            data_blocks: list[tuple[str]] = [("",)]
             # Pages layouts not the same in all reports
-
+            pg_hdr_strings = (
                 "FEDERAL TRADE COMMISSION",
                 "HORIZONTAL MERGER INVESTIGATION DATA: FISCAL YEARS 1996 - 2011",
             )
-            if len(
-
-                for
-                    if
-
-
-                            for
-                            if not
+            if len(doc_pg_blocks) > 4:
+                tnum = None
+                for _pg_blk in doc_pg_blocks:
+                    if tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
+                        data_blocks = [
+                            b_
+                            for b_ in doc_pg_blocks
+                            if not b_[-3].startswith(pg_hdr_strings)
                             and (
-
-                                not in
+                                b_[-3].strip()
+                                not in {"Significant Competitors", "Post Merger HHI"}
                             )
-                            and not re.fullmatch(r"\d+",
+                            and not re.fullmatch(r"\d+", b_[-3].strip())
                         ]
                         break
-                if not
+                if not tnum:
                     continue
-                del
+                del tnum
             else:
                 continue
 
-            _parse_page_blocks(
+            _parse_page_blocks(invdata, data_period, data_blocks)
 
-
+        invdata_doc.close()
 
-    return
+    return _mappingproxy_from_mapping(invdata)
 
 
 def _parse_page_blocks(
-    _invdata:
+    _invdata: INVData_in_, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
 ) -> None:
     if _data_period != "1996-2011":
         _parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
     else:
-
+        test_list = [
             (g, f[-3].strip())
             for g, f in enumerate(_doc_pg_blocks)
             if TABLE_NO_RE.fullmatch(f[-3].strip())
         ]
         # In the 1996-2011 report, there are 2 tables per page
-        if len(
-
-
+        if len(test_list) == 1:
+            table_a_blocks = _doc_pg_blocks
+            table_b_blocks: Sequence[Sequence[Any]] = []
         else:
-
-            _doc_pg_blocks[
-            _doc_pg_blocks[
+            table_a_blocks, table_b_blocks = (
+                _doc_pg_blocks[test_list[0][0] : test_list[1][0]],
+                _doc_pg_blocks[test_list[1][0] :],
             )
 
-        for
-            if not
+        for table_i_blocks in table_a_blocks, table_b_blocks:
+            if not table_i_blocks:
                 continue
-            _parse_table_blocks(_invdata, _data_period,
+            _parse_table_blocks(_invdata, _data_period, table_i_blocks)
 
 
 def _parse_table_blocks(
-    _invdata:
+    _invdata: INVData_in_, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
 ) -> None:
-
-
+    invdata_evid_cond = "Unrestricted on additional evidence"
+    table_num, table_ser, table_type = _identify_table_type(
         _table_blocks[0][-3].strip()
     )
 
     if _data_period == "1996-2011":
-
+        invdata_ind_group = (
             _table_blocks[1][-3].split("\n")[1]
-            if
+            if table_num == "Table 4.8"
             else _table_blocks[2][-3].split("\n")[0]
         )
 
-        if
-
+        if table_ser > 4:
+            invdata_evid_cond = (
                 _table_blocks[2][-3].split("\n")[1]
-                if
+                if table_ser in {9, 10}
                 else _table_blocks[3][-3].strip()
             )
 
     elif _data_period == "1996-2005":
         _table_blocks = sorted(_table_blocks, key=itemgetter(6))
 
-
-        if
-
+        invdata_ind_group = _table_blocks[3][-3].strip()
+        if table_ser > 4:
+            invdata_evid_cond = _table_blocks[5][-3].strip()
 
-    elif
-
-        if (
-
+    elif table_ser % 2 == 0:
+        invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
+        if (evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
+            invdata_evid_cond = "Unrestricted on additional evidence"
         else:
-
+            invdata_evid_cond = evid_cond_teststr
 
     elif _table_blocks[3][-3].startswith("FTC Horizontal Merger Investigations"):
-
-
+        invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
+        invdata_evid_cond = "Unrestricted on additional evidence"
 
     else:
         # print(_table_blocks)
-
+        invdata_evid_cond = (
             _table_blocks[1][-3].strip()
-            if
+            if table_ser == 9
             else _table_blocks[3][-3].strip()
         )
-
+        invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
 
-    if
-
+    if invdata_ind_group == "Pharmaceutical Markets":
+        invdata_ind_group = "Pharmaceuticals Markets"
 
     process_table_func = (
         _process_table_blks_conc_type
-        if
+        if table_type == TABLE_TYPES[0]
        else _process_table_blks_cnt_type
    )
 
-
-    if not isinstance(
-        print(
+    table_array = process_table_func(_table_blocks)
+    if not isinstance(table_array, np.ndarray) or table_array.dtype != np.uint64:
+        print(table_num)
         print(_table_blocks)
         raise ValueError
 
-
-    _invdata[_data_period][
+    table_data = INVTableData(invdata_ind_group, invdata_evid_cond, table_array)
+    _invdata[_data_period][table_type] |= {table_num: table_data}
 
 
 def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str]:
-
-
-    return _tnstr,
+    tnum = _tnstr.split(" ")[1]
+    tsub = int(tnum.split(".")[0])
+    return _tnstr, tsub, TABLE_TYPES[(tsub + 1) % 2]
 
 
 def _process_table_blks_conc_type(
     _table_blocks: Sequence[Sequence[str]], /
 ) -> ArrayBIGINT:
-
-
-
-
-
-
-    for
-        if
-
-
-
-
-
-            while
-
-
+    conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
+
+    col_titles_array = tuple(CONC_DELTA_DICT.values())
+    col_totals: ArrayBIGINT = np.zeros(len(col_titles_array), np.uint64)
+    invdata_array: ArrayBIGINT = np.array(None)
+
+    for tbl_blk in _table_blocks:
+        if conc_row_pat.match(_blk_str := tbl_blk[-3]):
+            row_list: list[str] = _blk_str.strip().split("\n")
+            row_title: str = row_list.pop(0)
+            row_key: int = CONC_HHI_DICT[row_title]
+            row_total = np.array(row_list.pop().replace(",", "").split("/"), np.uint64)
+            row_array_list: list[list[int]] = []
+            while row_list:
+                enfd_val, clsd_val = row_list.pop(0).split("/")
+                row_array_list += [
                     [
-
-
-                        int(
-                        int(
-                        int(
+                        row_key,
+                        col_titles_array[len(row_array_list)],
+                        int(enfd_val),
+                        int(clsd_val),
+                        int(enfd_val) + int(clsd_val),
                     ]
                 ]
-
+            row_array = np.array(row_array_list, np.uint64)
             # Check row totals
-            assert_array_equal(
+            assert_array_equal(row_total, np.einsum("ij->j", row_array[:, 2:4]))
 
-            if
-
+            if row_key == TTL_KEY:
+                col_totals = row_array
             else:
-
-                np.vstack((
-                if
-                else
+                invdata_array = (
+                    np.vstack((invdata_array, row_array))
+                    if invdata_array.shape
+                    else row_array
                 )
-                del
+                del row_array, row_array_list
         else:
             continue
 
     # Check column totals
-    for _col_tot in
+    for _col_tot in col_totals:
         assert_array_equal(
             _col_tot[2:],
             np.einsum(
-                "ij->j",
+                "ij->j", invdata_array[invdata_array[:, 1] == _col_tot[1]][:, 2:]
             ),
         )
 
-    return
-        np.argsort(np.einsum("ij,ij->i", [[100, 1]],
+    return invdata_array[
+        np.argsort(np.einsum("ij,ij->i", [[100, 1]], invdata_array[:, :2]))
     ]
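`_process_table_blks_conc_type` validates every parsed table against the totals printed in the report: the cells of each printed row are summed with `np.einsum("ij->j", ...)` and compared to the trailing "enforced/closed" pair, and the assembled rows are finally ordered by a composite HHI-then-delta key computed as a weighted einsum. A sketch of both steps on made-up data:

    import numpy as np
    from numpy.testing import assert_array_equal

    # Cells parsed from one printed row: [enforced, closed] per delta-HHI bin.
    cells = np.array([[3, 1], [1, 2], [0, 1]], np.uint64)
    printed_total = np.array([4, 4], np.uint64)  # the row's trailing "4/4"
    assert_array_equal(printed_total, np.einsum("ij->j", cells))

    # Ordering: 100 * HHI key + delta key gives one sortable value per row.
    keys = np.array([[200, 100], [100, 300]], np.uint64)
    order = np.argsort(np.einsum("ij,ij->i", [[100, 1]], keys))
    print(order)  # [1 0]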
 
 
 def _process_table_blks_cnt_type(
     _table_blocks: Sequence[Sequence[str]], /
 ) -> ArrayBIGINT:
-
+    cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
 
-
-
+    invdata_array: ArrayBIGINT = np.array(None)
+    col_totals: ArrayBIGINT = np.zeros(3, np.uint64)  # "enforced", "closed", "total"
 
     for _tbl_blk in _table_blocks:
-        if
-
-
-                [CNT_FCOUNT_DICT[
+        if cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
+            row_list_s = _blk_str.strip().replace(",", "").split("\n")
+            row_list = np.array(
+                [CNT_FCOUNT_DICT[row_list_s[0]], *row_list_s[1:]], np.uint64
             )
-            del
-            if
+            del row_list_s
+            if row_list[3] != row_list[1] + row_list[2]:
                 raise ValueError(
                     "Total number of investigations does not equal #enforced plus #closed."
                 )
-            if
-
+            if row_list[0] == TTL_KEY:
+                col_totals = row_list
             else:
-
-                np.vstack((
-                if
-                else
+                invdata_array = (
+                    np.vstack((invdata_array, row_list))
+                    if invdata_array.shape
+                    else row_list
                 )
         else:
            continue
 
     if not np.array_equal(
-        np.array(
-        np.einsum("ij->j",
+        np.array(list(col_totals[1:]), np.uint64),
+        np.einsum("ij->j", invdata_array[:, 1:]),
     ):
         raise ValueError("Column totals don't compute.")
 
-    return
+    return invdata_array[np.argsort(invdata_array[:, 0])]
 
 
 def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
     if not _dl_path.is_dir():
         _dl_path.mkdir(parents=True)
 
-
+    invdata_homepage_urls = (
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
     )
-
+    invdata_docnames = (
         "040831horizmergersdata96-03.pdf",
         "p035603horizmergerinvestigationdata1996-2005.pdf",
         "081201hsrmergerdata.pdf",
@@ -734,40 +727,40 @@ def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
     )
 
     if all(
-        _dl_path.joinpath(
-        for
+        _dl_path.joinpath(invdata_docname).is_file()
+        for invdata_docname in invdata_docnames
     ):
-        return
-
-
-
-
-    for
-        with
-            "GET",
+        return invdata_docnames
+
+    invdata_docnames_dl: tuple[str, ...] = ()
+    u3pm = urllib3.PoolManager()
+    chunk_size_ = 1024 * 1024
+    for invdata_homepage_url in invdata_homepage_urls:
+        with u3pm.request(
+            "GET", invdata_homepage_url, preload_content=False
         ) as _u3handle:
-
-
+            invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
+            invdata_attrs = [
                 (_g.get("title", ""), _g.get("href", ""))
-                for _g in
+                for _g in invdata_soup.find_all("a")
                 if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
             ]
-        for
-
-
+        for invdata_attr in invdata_attrs:
+            invdata_docname, invdata_link = invdata_attr
+            invdata_docnames_dl += (invdata_docname,)
             with (
-
-                "GET", f"https://www.ftc.gov/{
+                u3pm.request(
+                    "GET", f"https://www.ftc.gov/{invdata_link}", preload_content=False
                 ) as _urlopen_handle,
-                _dl_path.joinpath(
+                _dl_path.joinpath(invdata_docname).open("wb") as invdata_fh,
             ):
                 while True:
-
-                    if not
+                    data = _urlopen_handle.read(chunk_size_)
+                    if not data:
                         break
-
+                    invdata_fh.write(data)
 
-    return
+    return invdata_docnames_dl
 
 
 if __name__ == "__main__":
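`_download_invdata` fetches the four FTC report pages, scrapes PDF links by their `title` and `href` attributes, and streams each document to disk in 1 MiB chunks; `preload_content=False` keeps urllib3 from buffering whole files in memory. A condensed sketch of the same pattern for a single report page, where using the anchor's title as the local filename mirrors the function above:

    import urllib3
    from bs4 import BeautifulSoup

    http = urllib3.PoolManager()
    page = "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011"

    with http.request("GET", page, preload_content=False) as rsp:
        soup = BeautifulSoup(rsp.data, "html.parser")

    for a_ in soup.find_all("a"):
        title, href = a_.get("title", ""), a_.get("href", "")
        if not (title and href.endswith(".pdf")):
            continue
        with (
            http.request("GET", f"https://www.ftc.gov/{href}", preload_content=False) as dl,
            open(title, "wb") as fh,  # anchor title doubles as the local filename
        ):
            while chunk := dl.read(1024 * 1024):
                fh.write(chunk)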