mergeron 2025.739290.3-py3-none-any.whl → 2025.739290.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -10,25 +10,37 @@ Reported row and column totals from source data are not stored.
 
 from __future__ import annotations
 
+import re
 import shutil
-from collections.abc import Mapping, Sequence
-from dataclasses import dataclass
+from collections.abc import Sequence
 from importlib import resources
 from operator import itemgetter
 from pathlib import Path
 from types import MappingProxyType
 from typing import Any
+from zipfile import ZIP_DEFLATED, ZipFile
 
-import msgpack  # type: ignore
 import msgpack_numpy as m  # type: ignore
 import numpy as np
-import re2 as re  # type: ignore
 import urllib3
 from bs4 import BeautifulSoup
 from numpy.testing import assert_array_equal
-from ruamel import yaml
 
-from .. import _PKG_NAME, DATA_DIR, EMPTY_ARRAYINT, VERSION, ArrayBIGINT  # noqa: TID252
+from .. import (  # noqa: TID252
+    _PKG_NAME,
+    DATA_DIR,
+    EMPTY_ARRAYINT,
+    VERSION,
+    ArrayBIGINT,
+    this_yaml,
+)
+from . import (
+    INVData,
+    INVData_in_,
+    INVTableData,
+    _dict_from_mapping,
+    _mappingproxy_from_mapping,
+)
 
 __version__ = VERSION
 
@@ -38,7 +50,7 @@ FTCDATA_DIR = DATA_DIR / "FTCData"
 if not FTCDATA_DIR.is_dir():
     FTCDATA_DIR.mkdir(parents=True)
 
-INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.msgpack"
+INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.zip"
 if (
     not INVDATA_ARCHIVE_PATH.is_file()
     and (
@@ -93,32 +105,6 @@ CNT_FCOUNT_DICT = {
 }
 
 
-@dataclass(slots=True, frozen=True)
-class INVTableData:
-    industry_group: str
-    additional_evidence: str
-    data_array: ArrayBIGINT
-
-    @classmethod
-    def to_yaml(
-        cls, _r: yaml.representer.SafeRepresenter, _d: INVTableData
-    ) -> yaml.MappingNode:
-        _ret: yaml.MappingNode = _r.represent_mapping(
-            f"!{cls.__name__}", {_a: getattr(_d, _a) for _a in _d.__dataclass_fields__}
-        )
-        return _ret
-
-    @classmethod
-    def from_yaml(
-        cls, _c: yaml.constructor.SafeConstructor, _n: yaml.MappingNode
-    ) -> INVTableData:
-        return cls(**_c.construct_mapping(_n))
-
-
-type INVData = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
-type _INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
-
-
 def construct_data(
     _archive_path: Path = INVDATA_ARCHIVE_PATH,
     *,
@@ -157,75 +143,83 @@ def construct_data(
     """
 
     if _archive_path.is_file() and not rebuild_data:
-        _archived_data = msgpack.unpackb(_archive_path.read_bytes(), use_list=False)
-
-        _invdata: _INVData_in = {}
-        for _period in _archived_data:
-            _invdata[_period] = {}
-            for _table_type in _archived_data[_period]:
-                _invdata[_period][_table_type] = {}
-                for _table_no in _archived_data[_period][_table_type]:
-                    _invdata[_period][_table_type][_table_no] = INVTableData(
-                        *_archived_data[_period][_table_type][_table_no]
-                    )
-        return MappingProxyType(_invdata)
+        with (
+            ZipFile(_archive_path, "r") as _yzh,
+            _yzh.open(f"{_archive_path.stem}.yaml", "r") as _yfh,
+        ):
+            invdata_ = this_yaml.load(_yfh)
+        if isinstance(invdata_, MappingProxyType):
+            invdata_ = _mappingproxy_from_mapping(invdata_)
+            with (
+                ZipFile(_archive_path, "w", compression=ZIP_DEFLATED) as _yzh,
+                _yzh.open(f"{_archive_path.stem}.yaml", "w") as _yfh,
+            ):
+                this_yaml.dump(invdata_, _yfh)
+        return invdata_
 
-    _invdata = dict(_parse_invdata())  # type: ignore # Convert immutable to mutable
+    invdata: INVData_in_ = _dict_from_mapping(_parse_invdata())
 
     # Add some data periods (
    # only periods ending in 2011, others have few observations and
     # some incompatibilities
     # )
-    for _data_period in "2004-2011", "2006-2011", "2008-2011":
-        _invdata_bld = _construct_new_period_data(
-            _invdata,
-            _data_period,
+    for data_period in "2004-2011", "2006-2011", "2008-2011":
+        invdata_bld = _construct_new_period_data(
+            invdata,
+            data_period,
             flag_backward_compatibility=flag_backward_compatibility,
         )
-        _invdata |= {_data_period: _invdata_bld}
+        invdata |= {data_period: invdata_bld}
 
     # Create data for industries with no evidence on entry
-    for _data_period in _invdata:
-        _construct_no_evidence_data(_invdata, _data_period)
+    for data_period in invdata:
+        _construct_no_evidence_data(invdata, data_period)
 
     # Create a list of exclusions to named industries in the base period,
     # for construction of aggregate enforcement statistics where feasible
-    _industry_exclusion_list = (
+    industry_exclusion_list = {
         "AllMarkets",
         "OtherMarkets",
         "IndustriesinCommon",
         "",
         ("PharmaceuticalsMarkets" if flag_pharma_for_exclusion else None),
-    )
-    for _data_period in "1996-2003", "1996-2011", "2004-2011":
-        for _table_type, _table_no in zip(
+    }
+
+    # Construct aggregate tables
+    for data_period in "1996-2003", "1996-2011", "2004-2011":
+        for table_type, table_no in zip(
             TABLE_TYPES, (CONC_TABLE_ALL, CNT_TABLE_ALL), strict=True
         ):
-            _invdata_sub_tabletype = _invdata[_data_period][_table_type]
+            invdata_sub_tabletype = invdata[data_period][table_type]
 
-            _aggr_tables_list = [
-                _t
-                for _t in _invdata["1996-2003"][_table_type]
+            aggr_tables_list = [
+                t_
+                for t_ in invdata["1996-2003"][table_type]
                 if re.sub(
-                    r"\W", "", _invdata["1996-2003"][_table_type][_t].industry_group
+                    r"\W", "", invdata["1996-2003"][table_type][t_].industry_group
                 )
-                not in _industry_exclusion_list
+                not in industry_exclusion_list
             ]
 
-            _invdata_sub_tabletype |= {
-                _table_no.replace(".1", ".X"): _invdata_build_aggregate_table(
-                    _invdata_sub_tabletype, _aggr_tables_list
+            invdata_sub_tabletype |= {
+                table_no.replace(".1", ".X"): invdata_build_aggregate_table(
+                    invdata_sub_tabletype, aggr_tables_list
                 )
             }
 
-    _ = INVDATA_ARCHIVE_PATH.write_bytes(msgpack.packb(_invdata))
+    retval: INVData = _mappingproxy_from_mapping(invdata)
+    with (
+        ZipFile(_archive_path, "w", compression=ZIP_DEFLATED) as _yzh,
+        _yzh.open(f"{_archive_path.stem}.yaml", "w") as _yfh,
+    ):
+        this_yaml.dump(retval, _yfh)
 
-    return MappingProxyType(_invdata)
+    return retval
 
 
-def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) -> None:
-    _invdata_ind_grp = "All Markets"
-    _table_nos_map = dict(
+def _construct_no_evidence_data(_invdata: INVData_in_, _data_period: str, /) -> None:
+    invdata_ind_grp = "All Markets"
+    table_nos_map = dict(
         zip(
             (
                 "No Entry Evidence",
@@ -240,28 +234,28 @@ def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) ->
             strict=True,
         )
     )
-    for _invdata_evid_cond in (
+    for invdata_evid_cond in (
         "No Entry Evidence",
         "No Evidence on Customer Complaints",
         "No Evidence on Hot Documents",
     ):
-        for _stats_grp in ("ByHHIandDelta", "ByFirmCount"):
-            _invdata_sub_evid_cond_conc = _invdata[_data_period][_stats_grp]
+        for stats_grp in ("ByHHIandDelta", "ByFirmCount"):
+            invdata_sub_evid_cond_conc = _invdata[_data_period][stats_grp]
 
-            _dtn = _table_nos_map[_invdata_evid_cond]["ByHHIandDelta"]
-            _stn0 = "Table 4.1" if _stats_grp == "ByFirmCount" else "Table 3.1"
-            _stn1, _stn2 = (_dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
+            dtn = table_nos_map[invdata_evid_cond]["ByHHIandDelta"]
+            stn0 = "Table 4.1" if stats_grp == "ByFirmCount" else "Table 3.1"
+            stn1, stn2 = (dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
 
-            _invdata_sub_evid_cond_conc |= {
-                _dtn: INVTableData(
-                    _invdata_ind_grp,
-                    _invdata_evid_cond,
+            invdata_sub_evid_cond_conc |= {
+                dtn: INVTableData(
+                    invdata_ind_grp,
+                    invdata_evid_cond,
                     np.column_stack((
-                        _invdata_sub_evid_cond_conc[_stn0].data_array[:, :2],
+                        invdata_sub_evid_cond_conc[stn0].data_array[:, :2],
                         (
-                            _invdata_sub_evid_cond_conc[_stn0].data_array[:, 2:]
-                            - _invdata_sub_evid_cond_conc[_stn1].data_array[:, 2:]
-                            - _invdata_sub_evid_cond_conc[_stn2].data_array[:, 2:]
+                            invdata_sub_evid_cond_conc[stn0].data_array[:, 2:]
+                            - invdata_sub_evid_cond_conc[stn1].data_array[:, 2:]
+                            - invdata_sub_evid_cond_conc[stn2].data_array[:, 2:]
                         ),
                     )),
                 )
@@ -275,44 +269,44 @@ def _construct_new_period_data(
     *,
     flag_backward_compatibility: bool = False,
 ) -> dict[str, dict[str, INVTableData]]:
-    _cuml_period = "1996-{}".format(int(_data_period.split("-")[1]))
-    if _cuml_period != "1996-2011":
+    cuml_period = f"1996-{_data_period.split('-')[1]}"
+    if cuml_period != "1996-2011":
         raise ValueError('Expected cumulative period, "1996-2011"')
 
-    _invdata_cuml = _invdata[_cuml_period]
+    invdata_cuml = _invdata[cuml_period]
 
-    _base_period = "1996-{}".format(int(_data_period.split("-")[0]) - 1)
-    _invdata_base = _invdata[_base_period]
+    base_period = "1996-{}".format(int(_data_period.split("-")[0]) - 1)
+    invdata_base = _invdata[base_period]
 
-    if tuple(_invdata_cuml.keys()) != TABLE_TYPES:
+    if tuple(invdata_cuml.keys()) != TABLE_TYPES:
         raise ValueError("Source data does not include the expected groups of tables.")
 
-    _invdata_bld = {}
-    for _table_type in TABLE_TYPES:
-        _data_typesubdict = {}
-        for _table_no in _invdata_cuml[_table_type]:
-            _invdata_cuml_sub_table = _invdata_cuml[_table_type][_table_no]
-            _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array = (
-                _invdata_cuml_sub_table.industry_group,
-                _invdata_cuml_sub_table.additional_evidence,
-                _invdata_cuml_sub_table.data_array,
+    invdata_bld = {}
+    for table_type in TABLE_TYPES:
+        data_typesubdict = {}
+        for table_no in invdata_cuml[table_type]:
+            invdata_cuml_sub_table = invdata_cuml[table_type][table_no]
+            invdata_ind_group, invdata_evid_cond, invdata_cuml_array = (
+                invdata_cuml_sub_table.industry_group,
+                invdata_cuml_sub_table.additional_evidence,
+                invdata_cuml_sub_table.data_array,
             )
 
-            _invdata_base_sub_table = _invdata_base[_table_type].get(
-                _table_no, INVTableData("", "", EMPTY_ARRAYINT)
+            invdata_base_sub_table = invdata_base[table_type].get(
+                table_no, INVTableData("", "", EMPTY_ARRAYINT)
             )
 
-            (_invdata_base_ind_group, _invdata_base_evid_cond, _invdata_base_array) = (
-                getattr(_invdata_base_sub_table, _a)
+            (invdata_base_ind_group, invdata_base_evid_cond, invdata_base_array) = (
+                getattr(invdata_base_sub_table, _a)
                 for _a in ("industry_group", "additional_evidence", "data_array")
             )
 
             # Some tables can't be constructed due to inconsistencies in the data
             # across time periods
             if (
-                (_data_period != "2004-2011" and _invdata_ind_group != "All Markets")
-                or (_invdata_ind_group in ('"Other" Markets', "Industries in Common"))
-                or (_invdata_base_ind_group in ('"Other" Markets', ""))
+                (_data_period != "2004-2011" and invdata_ind_group != "All Markets")
+                or (invdata_ind_group in {'"Other" Markets', "Industries in Common"})
+                or (invdata_base_ind_group in {'"Other" Markets', ""})
             ):
                 continue
 
@@ -328,31 +322,29 @@
                 # The number of "revisions" applied below, for enforcing consistency,
                 # is sufficiently small as to be unlikely to substantially impact
                 # results from analysis of the data.
-                _invdata_cuml_array_stack = []
-                _invdata_base_array_stack = []
-
-                for _data_period_detail in _invdata:
-                    _pd_start, _pd_end = (
-                        int(g) for g in _data_period_detail.split("-")
-                    )
-                    if _pd_start == 1996:
-                        _invdata_cuml_array_stack += [
-                            _invdata[_data_period_detail][_table_type][
-                                _table_no
+                invdata_cuml_array_stack = []
+                invdata_base_array_stack = []
+
+                for data_period_detail in _invdata:
+                    pd_start, pd_end = (int(g) for g in data_period_detail.split("-"))
+                    if pd_start == 1996:
+                        invdata_cuml_array_stack += [
+                            _invdata[data_period_detail][table_type][
+                                table_no
                             ].data_array[:, -3:-1]
                         ]
-                    if _pd_start == 1996 and _pd_end < int(_data_period.split("-")[0]):
-                        _invdata_base_array_stack += [
-                            _invdata[_data_period_detail][_table_type][
-                                _table_no
+                    if pd_start == 1996 and pd_end < int(_data_period.split("-")[0]):
+                        invdata_base_array_stack += [
+                            _invdata[data_period_detail][table_type][
+                                table_no
                             ].data_array[:, -3:-1]
                         ]
-                _invdata_cuml_array_enfcls, _invdata_base_array_enfcls = (
+                invdata_cuml_array_enfcls, invdata_base_array_enfcls = (
                     np.stack(_f).max(axis=0)
-                    for _f in (_invdata_cuml_array_stack, _invdata_base_array_stack)
+                    for _f in (invdata_cuml_array_stack, invdata_base_array_stack)
                 )
-                _invdata_array_bld_enfcls = (
-                    _invdata_cuml_array_enfcls - _invdata_base_array_enfcls
+                invdata_array_bld_enfcls = (
+                    invdata_cuml_array_enfcls - invdata_base_array_enfcls
                 )
             else:
                 # Consistency here means that the most recent data are considered
@@ -369,59 +361,59 @@
                 # backward compatible due to minor variation in (applying) the criteria
                 # for inclusion, as well as industry coding, undertaken to maintain
                 # transparency on the enforcement process.
-                _invdata_array_bld_enfcls = (
-                    _invdata_cuml_array[:, -3:-1] - _invdata_base_array[:, -3:-1]  # type: ignore
+                invdata_array_bld_enfcls = (
+                    invdata_cuml_array[:, -3:-1] - invdata_base_array[:, -3:-1]
                 )
 
             # # // spellchecker: disable
             # To examine the number of corrected values per table, // spellchecker: disable
             # uncomment the statements below
-            # _invdata_array_bld_tbc = where(
-            #     _invdata_array_bld_enfcls < 0, _invdata_array_bld_enfcls, 0
+            # invdata_array_bld_tbc = where(
+            #     invdata_array_bld_enfcls < 0, invdata_array_bld_enfcls, 0
             # )
             # if np.einsum('ij->', invdata_array_bld_tbc):
             #     print(
-            #         f"{_data_period}, {_table_no}, {_invdata_ind_group}:",
+            #         f"{_data_period}, {_table_no}, {invdata_ind_group}:",
             #         abs(np.einsum('ij->', invdata_array_bld_tbc))
             #     )
             # # // spellchecker: disable
 
             # Enforce non-negativity
-            _invdata_array_bld_enfcls = np.stack((
-                _invdata_array_bld_enfcls,
-                np.zeros_like(_invdata_array_bld_enfcls),
+            invdata_array_bld_enfcls = np.stack((
+                invdata_array_bld_enfcls,
+                np.zeros_like(invdata_array_bld_enfcls),
             )).max(axis=0)
 
-            _invdata_array_bld = np.column_stack((
-                _invdata_cuml_array[:, :-3],
-                _invdata_array_bld_enfcls,
-                np.einsum("ij->i", _invdata_array_bld_enfcls),
+            invdata_array_bld = np.column_stack((
+                invdata_cuml_array[:, :-3],
+                invdata_array_bld_enfcls,
+                np.einsum("ij->i", invdata_array_bld_enfcls),
             ))
 
-            _data_typesubdict[_table_no] = INVTableData(
-                _invdata_ind_group, _invdata_evid_cond, _invdata_array_bld
+            data_typesubdict[table_no] = INVTableData(
+                invdata_ind_group, invdata_evid_cond, invdata_array_bld
             )
-            del _invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array
-            del _invdata_base_ind_group, _invdata_base_evid_cond, _invdata_base_array
-            del _invdata_array_bld
-        _invdata_bld[_table_type] = _data_typesubdict
-    return _invdata_bld
+            del invdata_ind_group, invdata_evid_cond, invdata_cuml_array
+            del invdata_base_ind_group, invdata_base_evid_cond, invdata_base_array
+            del invdata_array_bld
+        invdata_bld[table_type] = data_typesubdict
+    return invdata_bld
 
 
-def _invdata_build_aggregate_table(
+def invdata_build_aggregate_table(
     _data_typesub: dict[str, INVTableData], _aggr_table_list: Sequence[str]
 ) -> INVTableData:
-    _hdr_table_no = _aggr_table_list[0]
+    hdr_table_no = _aggr_table_list[0]
 
     return INVTableData(
         "Industries in Common",
         "Unrestricted on additional evidence",
         np.column_stack((
-            _data_typesub[_hdr_table_no].data_array[:, :-3],
+            _data_typesub[hdr_table_no].data_array[:, :-3],
             np.einsum(
                 "ijk->jk",
                 np.stack([
-                    (_data_typesub[_t]).data_array[:, -3:] for _t in _aggr_table_list
+                    (_data_typesub[t_]).data_array[:, -3:] for t_ in _aggr_table_list
                 ]),
             ),
         )),
@@ -439,294 +431,295 @@ def _parse_invdata() -> INVData:
     by range of HHI and ∆HHI.
 
     """
-    raise ValueError(
-        "This function is defined here as documentation.\n"
-        "NOTE: License for `pymupdf`, upon which this function depends,"
-        " may be incompatible with the MIT license,"
-        " under which this pacakge is distributed."
-        " Making this fumction operable requires the user to modify"
-        " the source code as well as to install an additional package"
-        " not distributed with this package or included in its dependencies."
-    )
-    import fitz  # type: ignore
-
-    _invdata_docnames = _download_invdata(FTCDATA_DIR)
-
-    _invdata: dict[str, dict[str, dict[str, INVTableData]]] = {}
-
-    for _invdata_docname in _invdata_docnames:
-        _invdata_pdf_path = FTCDATA_DIR.joinpath(_invdata_docname)
-
-        _invdata_fitz = fitz.open(_invdata_pdf_path)
-        _invdata_meta = _invdata_fitz.metadata
-        if _invdata_meta["title"] == " ":
-            _invdata_meta["title"] = ", ".join((
+    # raise ValueError(
+    #     "This function is defined here as documentation.\n"
+    #     "NOTE: License for `pymupdf`, upon which this function depends,"
+    #     " may be incompatible with the MIT license,"
+    #     " under which this pacakge is distributed."
+    #     " Making this fumction operable requires the user to modify"
+    #     " the source code as well as to install an additional package"
+    #     " not distributed with this package or identified as a requirement."
+    # )
+    import pymupdf  # type: ignore  # noqa: PLC0415
+
+    invdata_docnames = _download_invdata(FTCDATA_DIR)
+
+    invdata: INVData_in_ = {}
+
+    for invdata_docname in invdata_docnames:
+        invdata_pdf_path = FTCDATA_DIR.joinpath(invdata_docname)
+
+        invdata_doc = pymupdf.open(invdata_pdf_path)
+        invdata_meta = invdata_doc.metadata
+        if invdata_meta["title"] == " ":
+            invdata_meta["title"] = ", ".join((
                 "Horizontal Merger Investigation Data",
                 "Fiscal Years",
                 "1996-2005",
             ))
 
-        _data_period = re.findall(r"(\d{4}) *(-) *(\d{4})", _invdata_meta["title"])[0]
-        _data_period = "".join(_data_period)
+        data_period = "".join(  # line-break here for readability
+            re.findall(r"(\d{4}) *(-) *(\d{4})", invdata_meta["title"])[0]
+        )
 
         # Initialize containers for parsed data
-        _invdata[_data_period] = {k: {} for k in TABLE_TYPES}
+        invdata[data_period] = {k: {} for k in TABLE_TYPES}
 
-        for _pdf_pg in _invdata_fitz.pages():
-            _doc_pg_blocks = _pdf_pg.get_text("blocks", sort=False)
+        for pdf_pg in invdata_doc.pages():
+            doc_pg_blocks = pdf_pg.get_text("blocks", sort=False)
             # Across all published reports of FTC investigations data,
             # sorting lines (PDF page blocks) by the lower coordinates
             # and then the left coordinates is most effective for
             # ordering table rows in top-to-bottom order; this doesn't
             # work for the 1996-2005 data, however, so we resort later
-            _doc_pg_blocks = sorted([
+            doc_pg_blocks = sorted([
                 (f"{_f[3]:03.0f}{_f[0]:03.0f}{_f[1]:03.0f}{_f[2]:03.0f}", *_f)
-                for _f in _doc_pg_blocks
+                for _f in doc_pg_blocks
                 if _f[-1] == 0
             ])
 
-            _data_blocks: list[tuple[str]] = [("",)]
+            data_blocks: list[tuple[str]] = [("",)]
             # Pages layouts not the same in all reports
-            _pg_hdr_strings = (
+            pg_hdr_strings = (
                 "FEDERAL TRADE COMMISSION",
                 "HORIZONTAL MERGER INVESTIGATION DATA: FISCAL YEARS 1996 - 2011",
             )
-            if len(_doc_pg_blocks) > 4:
-                _tnum: re.match = None
-                for _blk_idx, _pg_blk in enumerate(_doc_pg_blocks):
-                    if _tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
-                        _data_blocks = [
-                            _b
-                            for _b in _doc_pg_blocks
-                            if not _b[-3].startswith(_pg_hdr_strings)
+            if len(doc_pg_blocks) > 4:
+                tnum = None
+                for _pg_blk in doc_pg_blocks:
+                    if tnum := TABLE_NO_RE.fullmatch(_pg_blk[-3].strip()):
+                        data_blocks = [
+                            b_
+                            for b_ in doc_pg_blocks
+                            if not b_[-3].startswith(pg_hdr_strings)
                             and (
-                                _b[-3].strip()
-                                not in ("Significant Competitors", "Post Merger HHI")
+                                b_[-3].strip()
+                                not in {"Significant Competitors", "Post Merger HHI"}
                             )
-                            and not re.fullmatch(r"\d+", _b[-3].strip())
+                            and not re.fullmatch(r"\d+", b_[-3].strip())
                         ]
                         break
-                if not _tnum:
+                if not tnum:
                     continue
-                del _tnum
+                del tnum
             else:
                 continue
 
-            _parse_page_blocks(_invdata, _data_period, _data_blocks)
+            _parse_page_blocks(invdata, data_period, data_blocks)
 
-        _invdata_fitz.close()
+        invdata_doc.close()
 
-    return MappingProxyType(_invdata)
+    return _mappingproxy_from_mapping(invdata)
 
 
 def _parse_page_blocks(
-    _invdata: _INVData_in, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
+    _invdata: INVData_in_, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
 ) -> None:
     if _data_period != "1996-2011":
         _parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
     else:
-        _test_list = [
+        test_list = [
             (g, f[-3].strip())
             for g, f in enumerate(_doc_pg_blocks)
             if TABLE_NO_RE.fullmatch(f[-3].strip())
         ]
         # In the 1996-2011 report, there are 2 tables per page
-        if len(_test_list) == 1:
-            _table_a_blocks = _doc_pg_blocks
-            _table_b_blocks: Sequence[Sequence[Any]] = []
+        if len(test_list) == 1:
+            table_a_blocks = _doc_pg_blocks
+            table_b_blocks: Sequence[Sequence[Any]] = []
         else:
-            _table_a_blocks, _table_b_blocks = (
-                _doc_pg_blocks[_test_list[0][0] : _test_list[1][0]],
-                _doc_pg_blocks[_test_list[1][0] :],
+            table_a_blocks, table_b_blocks = (
+                _doc_pg_blocks[test_list[0][0] : test_list[1][0]],
+                _doc_pg_blocks[test_list[1][0] :],
             )
 
-        for _table_i_blocks in _table_a_blocks, _table_b_blocks:
-            if not _table_i_blocks:
+        for table_i_blocks in table_a_blocks, table_b_blocks:
+            if not table_i_blocks:
                 continue
-            _parse_table_blocks(_invdata, _data_period, _table_i_blocks)
+            _parse_table_blocks(_invdata, _data_period, table_i_blocks)
 
 
 def _parse_table_blocks(
-    _invdata: _INVData_in, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
+    _invdata: INVData_in_, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
 ) -> None:
-    _invdata_evid_cond = "Unrestricted on additional evidence"
-    _table_num, _table_ser, _table_type = _identify_table_type(
+    invdata_evid_cond = "Unrestricted on additional evidence"
+    table_num, table_ser, table_type = _identify_table_type(
         _table_blocks[0][-3].strip()
     )
 
     if _data_period == "1996-2011":
-        _invdata_ind_group = (
+        invdata_ind_group = (
             _table_blocks[1][-3].split("\n")[1]
-            if _table_num == "Table 4.8"
+            if table_num == "Table 4.8"
             else _table_blocks[2][-3].split("\n")[0]
         )
 
-        if _table_ser > 4:
-            _invdata_evid_cond = (
+        if table_ser > 4:
+            invdata_evid_cond = (
                 _table_blocks[2][-3].split("\n")[1]
-                if _table_ser in (9, 10)
+                if table_ser in {9, 10}
                 else _table_blocks[3][-3].strip()
             )
 
     elif _data_period == "1996-2005":
         _table_blocks = sorted(_table_blocks, key=itemgetter(6))
 
-        _invdata_ind_group = _table_blocks[3][-3].strip()
-        if _table_ser > 4:
-            _invdata_evid_cond = _table_blocks[5][-3].strip()
+        invdata_ind_group = _table_blocks[3][-3].strip()
+        if table_ser > 4:
+            invdata_evid_cond = _table_blocks[5][-3].strip()
 
-    elif _table_ser % 2 == 0:
-        _invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
-        if (_evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
-            _invdata_evid_cond = "Unrestricted on additional evidence"
+    elif table_ser % 2 == 0:
+        invdata_ind_group = _table_blocks[1][-3].split("\n")[2]
+        if (evid_cond_teststr := _table_blocks[2][-3].strip()) == "Outcome":
+            invdata_evid_cond = "Unrestricted on additional evidence"
         else:
-            _invdata_evid_cond = _evid_cond_teststr
+            invdata_evid_cond = evid_cond_teststr
 
     elif _table_blocks[3][-3].startswith("FTC Horizontal Merger Investigations"):
-        _invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
-        _invdata_evid_cond = "Unrestricted on additional evidence"
+        invdata_ind_group = _table_blocks[3][-3].split("\n")[2]
+        invdata_evid_cond = "Unrestricted on additional evidence"
 
     else:
         # print(_table_blocks)
-        _invdata_evid_cond = (
+        invdata_evid_cond = (
             _table_blocks[1][-3].strip()
-            if _table_ser == 9
+            if table_ser == 9
             else _table_blocks[3][-3].strip()
         )
-        _invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
+        invdata_ind_group = _table_blocks[4][-3].split("\n")[2]
 
-    if _invdata_ind_group == "Pharmaceutical Markets":
-        _invdata_ind_group = "Pharmaceuticals Markets"
+    if invdata_ind_group == "Pharmaceutical Markets":
+        invdata_ind_group = "Pharmaceuticals Markets"
 
     process_table_func = (
         _process_table_blks_conc_type
-        if _table_type == TABLE_TYPES[0]
+        if table_type == TABLE_TYPES[0]
         else _process_table_blks_cnt_type
     )
 
-    _table_array = process_table_func(_table_blocks)
-    if not isinstance(_table_array, np.ndarray) or _table_array.dtype != np.int64:
-        print(_table_num)
+    table_array = process_table_func(_table_blocks)
+    if not isinstance(table_array, np.ndarray) or table_array.dtype != np.uint64:
+        print(table_num)
         print(_table_blocks)
         raise ValueError
 
-    _table_data = INVTableData(_invdata_ind_group, _invdata_evid_cond, _table_array)
-    _invdata[_data_period][_table_type] |= {_table_num: _table_data}
+    table_data = INVTableData(invdata_ind_group, invdata_evid_cond, table_array)
+    _invdata[_data_period][table_type] |= {table_num: table_data}
 
 
 def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str]:
-    _tnum = _tnstr.split(" ")[1]
-    _tsub = int(_tnum.split(".")[0])
-    return _tnstr, _tsub, TABLE_TYPES[(_tsub + 1) % 2]
+    tnum = _tnstr.split(" ")[1]
+    tsub = int(tnum.split(".")[0])
+    return _tnstr, tsub, TABLE_TYPES[(tsub + 1) % 2]
 
 
 def _process_table_blks_conc_type(
     _table_blocks: Sequence[Sequence[str]], /
 ) -> ArrayBIGINT:
-    _conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
-
-    _col_titles_array = tuple(CONC_DELTA_DICT.values())
-    _col_totals: ArrayBIGINT = np.zeros(len(_col_titles_array), np.int64)
-    _invdata_array: ArrayBIGINT = np.array(None)
-
-    for _tbl_blk in _table_blocks:
-        if _conc_row_pat.match(_blk_str := _tbl_blk[-3]):
-            _row_list: list[str] = _blk_str.strip().split("\n")
-            _row_title: str = _row_list.pop(0)
-            _row_key: int = CONC_HHI_DICT[_row_title]
-            _row_total = np.array(_row_list.pop().replace(",", "").split("/"), np.int64)
-            _row_array_list: list[list[int]] = []
-            while _row_list:
-                _enfd_val, _clsd_val = _row_list.pop(0).split("/")
-                _row_array_list += [
+    conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
+
+    col_titles_array = tuple(CONC_DELTA_DICT.values())
+    col_totals: ArrayBIGINT = np.zeros(len(col_titles_array), np.uint64)
+    invdata_array: ArrayBIGINT = np.array(None)
+
+    for tbl_blk in _table_blocks:
+        if conc_row_pat.match(_blk_str := tbl_blk[-3]):
+            row_list: list[str] = _blk_str.strip().split("\n")
+            row_title: str = row_list.pop(0)
+            row_key: int = CONC_HHI_DICT[row_title]
+            row_total = np.array(row_list.pop().replace(",", "").split("/"), np.uint64)
+            row_array_list: list[list[int]] = []
+            while row_list:
+                enfd_val, clsd_val = row_list.pop(0).split("/")
+                row_array_list += [
                     [
-                        _row_key,
-                        _col_titles_array[len(_row_array_list)],
-                        int(_enfd_val),
-                        int(_clsd_val),
-                        int(_enfd_val) + int(_clsd_val),
+                        row_key,
+                        col_titles_array[len(row_array_list)],
+                        int(enfd_val),
+                        int(clsd_val),
+                        int(enfd_val) + int(clsd_val),
                     ]
                 ]
-            _row_array = np.array(_row_array_list, np.int64)
+            row_array = np.array(row_array_list, np.uint64)
             # Check row totals
-            assert_array_equal(_row_total, np.einsum("ij->j", _row_array[:, 2:4]))
+            assert_array_equal(row_total, np.einsum("ij->j", row_array[:, 2:4]))
 
-            if _row_key == TTL_KEY:
-                _col_totals = _row_array
+            if row_key == TTL_KEY:
+                col_totals = row_array
             else:
-                _invdata_array = (
-                    np.vstack((_invdata_array, _row_array))
-                    if _invdata_array.shape
-                    else _row_array
+                invdata_array = (
+                    np.vstack((invdata_array, row_array))
+                    if invdata_array.shape
+                    else row_array
                 )
-            del _row_array, _row_array_list
+            del row_array, row_array_list
         else:
            continue
 
     # Check column totals
-    for _col_tot in _col_totals:
+    for _col_tot in col_totals:
         assert_array_equal(
             _col_tot[2:],
             np.einsum(
-                "ij->j", _invdata_array[_invdata_array[:, 1] == _col_tot[1]][:, 2:]
+                "ij->j", invdata_array[invdata_array[:, 1] == _col_tot[1]][:, 2:]
             ),
         )
 
-    return _invdata_array[
-        np.argsort(np.einsum("ij,ij->i", [[100, 1]], _invdata_array[:, :2]))
+    return invdata_array[
+        np.argsort(np.einsum("ij,ij->i", [[100, 1]], invdata_array[:, :2]))
     ]
 
 
 def _process_table_blks_cnt_type(
     _table_blocks: Sequence[Sequence[str]], /
 ) -> ArrayBIGINT:
-    _cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
+    cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
 
-    _invdata_array: ArrayBIGINT = np.array(None)
-    _col_totals: ArrayBIGINT = np.zeros(3, np.int64)  # "enforced", "closed", "total"
+    invdata_array: ArrayBIGINT = np.array(None)
+    col_totals: ArrayBIGINT = np.zeros(3, np.uint64)  # "enforced", "closed", "total"
 
     for _tbl_blk in _table_blocks:
-        if _cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
-            _row_list_s = _blk_str.strip().replace(",", "").split("\n")
-            _row_list = np.array(
-                [CNT_FCOUNT_DICT[_row_list_s[0]], *_row_list_s[1:]], np.int64
+        if cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
+            row_list_s = _blk_str.strip().replace(",", "").split("\n")
+            row_list = np.array(
+                [CNT_FCOUNT_DICT[row_list_s[0]], *row_list_s[1:]], np.uint64
             )
-            del _row_list_s
-            if _row_list[3] != _row_list[1] + _row_list[2]:
+            del row_list_s
+            if row_list[3] != row_list[1] + row_list[2]:
                 raise ValueError(
                     "Total number of investigations does not equal #enforced plus #closed."
                 )
-            if _row_list[0] == TTL_KEY:
-                _col_totals = _row_list
+            if row_list[0] == TTL_KEY:
+                col_totals = row_list
             else:
-                _invdata_array = (
-                    np.vstack((_invdata_array, _row_list))
-                    if _invdata_array.shape
-                    else _row_list
+                invdata_array = (
+                    np.vstack((invdata_array, row_list))
+                    if invdata_array.shape
+                    else row_list
                 )
         else:
            continue
 
     if not np.array_equal(
-        np.array([int(f) for f in _col_totals[1:]], np.int64),
-        np.einsum("ij->j", _invdata_array[:, 1:]),
+        np.array(list(col_totals[1:]), np.uint64),
+        np.einsum("ij->j", invdata_array[:, 1:]),
    ):
         raise ValueError("Column totals don't compute.")
 
-    return _invdata_array[np.argsort(_invdata_array[:, 0])]
+    return invdata_array[np.argsort(invdata_array[:, 0])]
 
 
 def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
     if not _dl_path.is_dir():
         _dl_path.mkdir(parents=True)
 
-    _invdata_homepage_urls = (
+    invdata_homepage_urls = (
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
         "https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
     )
-    _invdata_docnames = (
+    invdata_docnames = (
         "040831horizmergersdata96-03.pdf",
         "p035603horizmergerinvestigationdata1996-2005.pdf",
         "081201hsrmergerdata.pdf",
@@ -734,40 +727,40 @@ def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
     )
 
     if all(
-        _dl_path.joinpath(_invdata_docname).is_file()
-        for _invdata_docname in _invdata_docnames
+        _dl_path.joinpath(invdata_docname).is_file()
+        for invdata_docname in invdata_docnames
     ):
-        return _invdata_docnames
-
-    _invdata_docnames_dl: tuple[str, ...] = ()
-    _u3pm = urllib3.PoolManager()
-    _chunk_size = 1024 * 1024
-    for _invdata_homepage_url in _invdata_homepage_urls:
-        with _u3pm.request(
-            "GET", _invdata_homepage_url, preload_content=False
+        return invdata_docnames
+
+    invdata_docnames_dl: tuple[str, ...] = ()
+    u3pm = urllib3.PoolManager()
+    chunk_size_ = 1024 * 1024
+    for invdata_homepage_url in invdata_homepage_urls:
+        with u3pm.request(
+            "GET", invdata_homepage_url, preload_content=False
         ) as _u3handle:
-            _invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
-            _invdata_attrs = [
+            invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
+            invdata_attrs = [
                 (_g.get("title", ""), _g.get("href", ""))
-                for _g in _invdata_soup.find_all("a")
+                for _g in invdata_soup.find_all("a")
                 if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
             ]
-        for _invdata_attr in _invdata_attrs:
-            _invdata_docname, _invdata_link = _invdata_attr
-            _invdata_docnames_dl += (_invdata_docname,)
+        for invdata_attr in invdata_attrs:
+            invdata_docname, invdata_link = invdata_attr
+            invdata_docnames_dl += (invdata_docname,)
             with (
-                _u3pm.request(
-                    "GET", f"https://www.ftc.gov/{_invdata_link}", preload_content=False
+                u3pm.request(
+                    "GET", f"https://www.ftc.gov/{invdata_link}", preload_content=False
                 ) as _urlopen_handle,
-                _dl_path.joinpath(_invdata_docname).open("wb") as _invdata_fh,
+                _dl_path.joinpath(invdata_docname).open("wb") as invdata_fh,
             ):
                 while True:
-                    _data = _urlopen_handle.read(_chunk_size)
-                    if not _data:
+                    data = _urlopen_handle.read(chunk_size_)
+                    if not data:
                         break
-                    _invdata_fh.write(_data)
+                    invdata_fh.write(data)
 
-    return _invdata_docnames_dl
+    return invdata_docnames_dl
 
 
 if __name__ == "__main__":