mergeron 2024.738953.1__py3-none-any.whl → 2025.739265.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mergeron might be problematic. Click here for more details.
- mergeron/__init__.py +26 -6
- mergeron/core/__init__.py +5 -65
- mergeron/core/{damodaran_margin_data.py → empirical_margin_distribution.py} +74 -58
- mergeron/core/ftc_merger_investigations_data.py +147 -101
- mergeron/core/guidelines_boundaries.py +290 -1078
- mergeron/core/guidelines_boundary_functions.py +1128 -0
- mergeron/core/{guidelines_boundaries_specialized_functions.py → guidelines_boundary_functions_extra.py} +87 -55
- mergeron/core/pseudorandom_numbers.py +16 -22
- mergeron/data/__init__.py +3 -0
- mergeron/data/damodaran_margin_data.xls +0 -0
- mergeron/data/damodaran_margin_data_dict.msgpack +0 -0
- mergeron/demo/__init__.py +3 -0
- mergeron/demo/visualize_empirical_margin_distribution.py +86 -0
- mergeron/gen/__init__.py +258 -246
- mergeron/gen/data_generation.py +473 -224
- mergeron/gen/data_generation_functions.py +876 -0
- mergeron/gen/enforcement_stats.py +355 -0
- mergeron/gen/upp_tests.py +171 -259
- mergeron-2025.739265.0.dist-info/METADATA +115 -0
- mergeron-2025.739265.0.dist-info/RECORD +23 -0
- {mergeron-2024.738953.1.dist-info → mergeron-2025.739265.0.dist-info}/WHEEL +1 -1
- mergeron/License.txt +0 -16
- mergeron/core/InCommon RSA Server CA cert chain.pem +0 -68
- mergeron/core/excel_helper.py +0 -257
- mergeron/core/proportions_tests.py +0 -520
- mergeron/ext/__init__.py +0 -5
- mergeron/ext/tol_colors.py +0 -851
- mergeron/gen/_data_generation_functions_nonpublic.py +0 -623
- mergeron/gen/investigations_stats.py +0 -709
- mergeron/jinja_LaTex_templates/clrrate_cis_summary_table_template.tex.jinja2 +0 -121
- mergeron/jinja_LaTex_templates/ftcinvdata_byhhianddelta_table_template.tex.jinja2 +0 -82
- mergeron/jinja_LaTex_templates/ftcinvdata_summary_table_template.tex.jinja2 +0 -57
- mergeron/jinja_LaTex_templates/ftcinvdata_summarypaired_table_template.tex.jinja2 +0 -104
- mergeron/jinja_LaTex_templates/mergeron.cls +0 -161
- mergeron/jinja_LaTex_templates/mergeron_table_collection_template.tex.jinja2 +0 -90
- mergeron/jinja_LaTex_templates/setup_tikz_tables.tex.jinja2 +0 -84
- mergeron-2024.738953.1.dist-info/METADATA +0 -93
- mergeron-2024.738953.1.dist-info/RECORD +0 -30
- /mergeron/{core → data}/ftc_invdata.msgpack +0 -0
|
@@ -1,34 +1,32 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Methods to parse FTC Merger Investigations Data, downloading source documents
|
|
3
3
|
as necessary
|
|
4
4
|
|
|
5
5
|
NOTES
|
|
6
6
|
-----
|
|
7
|
-
|
|
7
|
+
Reported row and column totals from source data are not stored.
|
|
8
8
|
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
import shutil
|
|
12
12
|
from collections.abc import Mapping, Sequence
|
|
13
|
-
from importlib
|
|
13
|
+
from importlib import resources
|
|
14
14
|
from operator import itemgetter
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from types import MappingProxyType
|
|
17
|
-
from typing import Any, NamedTuple
|
|
17
|
+
from typing import Any, NamedTuple
|
|
18
18
|
|
|
19
|
-
import fitz # type: ignore
|
|
20
19
|
import msgpack # type: ignore
|
|
21
20
|
import msgpack_numpy as m # type: ignore
|
|
22
21
|
import numpy as np
|
|
23
22
|
import re2 as re # type: ignore
|
|
24
|
-
import
|
|
23
|
+
import urllib3
|
|
25
24
|
from bs4 import BeautifulSoup
|
|
26
25
|
from numpy.testing import assert_array_equal
|
|
27
|
-
from numpy.typing import NDArray
|
|
28
26
|
|
|
29
|
-
from .. import _PKG_NAME, DATA_DIR # noqa: TID252
|
|
27
|
+
from .. import _PKG_NAME, DATA_DIR, VERSION, ArrayBIGINT # noqa: TID252
|
|
30
28
|
|
|
31
|
-
__version__ =
|
|
29
|
+
__version__ = VERSION
|
|
32
30
|
|
|
33
31
|
m.patch()
|
|
34
32
|
|
|
@@ -37,11 +35,16 @@ if not FTCDATA_DIR.is_dir():
|
|
|
37
35
|
FTCDATA_DIR.mkdir(parents=True)
|
|
38
36
|
|
|
39
37
|
INVDATA_ARCHIVE_PATH = DATA_DIR / "ftc_invdata.msgpack"
|
|
40
|
-
if
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
38
|
+
if (
|
|
39
|
+
not INVDATA_ARCHIVE_PATH.is_file()
|
|
40
|
+
and (
|
|
41
|
+
_bundled_copy := resources.files(f"{_PKG_NAME}.data").joinpath(
|
|
42
|
+
INVDATA_ARCHIVE_PATH.name
|
|
43
|
+
)
|
|
44
|
+
).is_file()
|
|
45
|
+
):
|
|
46
|
+
with resources.as_file(_bundled_copy) as _bundled_copy_path:
|
|
47
|
+
shutil.copy2(_bundled_copy_path, INVDATA_ARCHIVE_PATH)
|
|
45
48
|
|
|
46
49
|
TABLE_NO_RE = re.compile(r"Table \d+\.\d+")
|
|
47
50
|
TABLE_TYPES = ("ByHHIandDelta", "ByFirmCount")
|
|
@@ -87,16 +90,17 @@ CNT_FCOUNT_DICT = {
|
|
|
87
90
|
|
|
88
91
|
|
|
89
92
|
class INVTableData(NamedTuple):
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
data_array:
|
|
93
|
+
industry_group: str
|
|
94
|
+
additional_evidence: str
|
|
95
|
+
data_array: ArrayBIGINT
|
|
93
96
|
|
|
94
97
|
|
|
95
|
-
INVData
|
|
98
|
+
type INVData = Mapping[str, Mapping[str, Mapping[str, INVTableData]]]
|
|
99
|
+
type _INVData_in = dict[str, dict[str, dict[str, INVTableData]]]
|
|
96
100
|
|
|
97
101
|
|
|
98
102
|
def construct_data(
|
|
99
|
-
_archive_path: Path
|
|
103
|
+
_archive_path: Path = INVDATA_ARCHIVE_PATH,
|
|
100
104
|
*,
|
|
101
105
|
flag_backward_compatibility: bool = True,
|
|
102
106
|
flag_pharma_for_exclusion: bool = True,
|
|
@@ -131,11 +135,11 @@ def construct_data(
|
|
|
131
135
|
A dictionary of merger investigations data keyed to reporting periods
|
|
132
136
|
|
|
133
137
|
"""
|
|
134
|
-
|
|
138
|
+
|
|
135
139
|
if _archive_path.is_file() and not rebuild_data:
|
|
136
140
|
_archived_data = msgpack.unpackb(_archive_path.read_bytes(), use_list=False)
|
|
137
141
|
|
|
138
|
-
_invdata:
|
|
142
|
+
_invdata: _INVData_in = {}
|
|
139
143
|
for _period in _archived_data:
|
|
140
144
|
_invdata[_period] = {}
|
|
141
145
|
for _table_type in _archived_data[_period]:
|
|
@@ -146,7 +150,7 @@ def construct_data(
|
|
|
146
150
|
)
|
|
147
151
|
return MappingProxyType(_invdata)
|
|
148
152
|
|
|
149
|
-
_invdata = dict(
|
|
153
|
+
_invdata = dict(_parse_invdata()) # type: ignore # Convert immutable to mutable
|
|
150
154
|
|
|
151
155
|
# Add some data periods (
|
|
152
156
|
# only periods ending in 2011, others have few observations and
|
|
@@ -162,7 +166,7 @@ def construct_data(
|
|
|
162
166
|
|
|
163
167
|
# Create data for industries with no evidence on entry
|
|
164
168
|
for _data_period in _invdata:
|
|
165
|
-
|
|
169
|
+
_construct_no_evidence_data(_invdata, _data_period)
|
|
166
170
|
|
|
167
171
|
# Create a list of exclusions to named industries in the base period,
|
|
168
172
|
# for construction of aggregate enforcement statistics where feasible
|
|
@@ -182,7 +186,9 @@ def construct_data(
|
|
|
182
186
|
_aggr_tables_list = [
|
|
183
187
|
_t
|
|
184
188
|
for _t in _invdata["1996-2003"][_table_type]
|
|
185
|
-
if re.sub(
|
|
189
|
+
if re.sub(
|
|
190
|
+
r"\W", "", _invdata["1996-2003"][_table_type][_t].industry_group
|
|
191
|
+
)
|
|
186
192
|
not in _industry_exclusion_list
|
|
187
193
|
]
|
|
188
194
|
|
|
@@ -192,42 +198,54 @@ def construct_data(
|
|
|
192
198
|
)
|
|
193
199
|
}
|
|
194
200
|
|
|
195
|
-
_ = INVDATA_ARCHIVE_PATH.write_bytes(msgpack.packb(_invdata))
|
|
201
|
+
_ = INVDATA_ARCHIVE_PATH.write_bytes(msgpack.packb(_invdata)) # pyright: ignore
|
|
196
202
|
|
|
197
203
|
return MappingProxyType(_invdata)
|
|
198
204
|
|
|
199
205
|
|
|
200
|
-
def
|
|
206
|
+
def _construct_no_evidence_data(_invdata: _INVData_in, _data_period: str, /) -> None:
|
|
201
207
|
_invdata_ind_grp = "All Markets"
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
_invdata_sub_evid_cond_conc = _invdata[_data_period]["ByHHIandDelta"]
|
|
205
|
-
_invdata_sub_evid_cond_conc["Table 9.X"] = INVTableData(
|
|
206
|
-
_invdata_ind_grp,
|
|
207
|
-
_invdata_evid_cond,
|
|
208
|
-
np.column_stack((
|
|
209
|
-
_invdata_sub_evid_cond_conc["Table 3.1"].data_array[:, :2],
|
|
208
|
+
_table_nos_map = dict(
|
|
209
|
+
zip(
|
|
210
210
|
(
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
211
|
+
"No Entry Evidence",
|
|
212
|
+
"No Evidence on Customer Complaints",
|
|
213
|
+
"No Evidence on Hot Documents",
|
|
214
214
|
),
|
|
215
|
-
)),
|
|
216
|
-
)
|
|
217
|
-
|
|
218
|
-
_invdata_sub_evid_cond_fcount = _invdata[_data_period]["ByFirmCount"]
|
|
219
|
-
_invdata_sub_evid_cond_fcount["Table 10.X"] = INVTableData(
|
|
220
|
-
_invdata_ind_grp,
|
|
221
|
-
_invdata_evid_cond,
|
|
222
|
-
np.column_stack((
|
|
223
|
-
_invdata_sub_evid_cond_fcount["Table 4.1"].data_array[:, :1],
|
|
224
215
|
(
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
216
|
+
{"ByHHIandDelta": "Table 9.X", "ByFirmCount": "Table 10.X"},
|
|
217
|
+
{"ByHHIandDelta": "Table 7.X", "ByFirmCount": "Table 8.X"},
|
|
218
|
+
{"ByHHIandDelta": "Table 5.X", "ByFirmCount": "Table 6.X"},
|
|
228
219
|
),
|
|
229
|
-
|
|
220
|
+
strict=True,
|
|
221
|
+
)
|
|
230
222
|
)
|
|
223
|
+
for _invdata_evid_cond in (
|
|
224
|
+
"No Entry Evidence",
|
|
225
|
+
"No Evidence on Customer Complaints",
|
|
226
|
+
"No Evidence on Hot Documents",
|
|
227
|
+
):
|
|
228
|
+
for _stats_grp in ("ByHHIandDelta", "ByFirmCount"):
|
|
229
|
+
_invdata_sub_evid_cond_conc = _invdata[_data_period][_stats_grp]
|
|
230
|
+
|
|
231
|
+
_dtn = _table_nos_map[_invdata_evid_cond]["ByHHIandDelta"]
|
|
232
|
+
_stn0 = "Table 4.1" if _stats_grp == "ByFirmCount" else "Table 3.1"
|
|
233
|
+
_stn1, _stn2 = (_dtn.replace(".X", f".{_i}") for _i in ("1", "2"))
|
|
234
|
+
|
|
235
|
+
_invdata_sub_evid_cond_conc |= {
|
|
236
|
+
_dtn: INVTableData(
|
|
237
|
+
_invdata_ind_grp,
|
|
238
|
+
_invdata_evid_cond,
|
|
239
|
+
np.column_stack((
|
|
240
|
+
_invdata_sub_evid_cond_conc[_stn0].data_array[:, :2],
|
|
241
|
+
(
|
|
242
|
+
_invdata_sub_evid_cond_conc[_stn0].data_array[:, 2:]
|
|
243
|
+
- _invdata_sub_evid_cond_conc[_stn1].data_array[:, 2:]
|
|
244
|
+
- _invdata_sub_evid_cond_conc[_stn2].data_array[:, 2:]
|
|
245
|
+
),
|
|
246
|
+
)),
|
|
247
|
+
)
|
|
248
|
+
}
|
|
231
249
|
|
|
232
250
|
|
|
233
251
|
def _construct_new_period_data(
|
|
@@ -255,8 +273,8 @@ def _construct_new_period_data(
|
|
|
255
273
|
for _table_no in _invdata_cuml[_table_type]:
|
|
256
274
|
_invdata_cuml_sub_table = _invdata_cuml[_table_type][_table_no]
|
|
257
275
|
_invdata_ind_group, _invdata_evid_cond, _invdata_cuml_array = (
|
|
258
|
-
_invdata_cuml_sub_table.
|
|
259
|
-
_invdata_cuml_sub_table.
|
|
276
|
+
_invdata_cuml_sub_table.industry_group,
|
|
277
|
+
_invdata_cuml_sub_table.additional_evidence,
|
|
260
278
|
_invdata_cuml_sub_table.data_array,
|
|
261
279
|
)
|
|
262
280
|
|
|
@@ -280,7 +298,7 @@ def _construct_new_period_data(
|
|
|
280
298
|
# Consistency here means that the number of investigations reported
|
|
281
299
|
# in each period is no less than the number reported in
|
|
282
300
|
# any prior period.Although the time periods for table 3.2 through 3.5
|
|
283
|
-
# are not the
|
|
301
|
+
# are not the same in the data for 1996-2005 and 1996-2007 as in
|
|
284
302
|
# the data for the other periods, they are nonetheless shorter than
|
|
285
303
|
# the period 1996-2011, and hence the counts reported for 1996-2011
|
|
286
304
|
# cannot be less than those reported in these prior periods. Note that
|
|
@@ -332,7 +350,8 @@ def _construct_new_period_data(
|
|
|
332
350
|
_invdata_cuml_array[:, -3:-1] - _invdata_base_array[:, -3:-1] # type: ignore
|
|
333
351
|
)
|
|
334
352
|
|
|
335
|
-
#
|
|
353
|
+
# # // spellchecker: disable
|
|
354
|
+
# To examine the number of corrected values per table, // spellchecker: disable
|
|
336
355
|
# uncomment the statements below
|
|
337
356
|
# _invdata_array_bld_tbc = where(
|
|
338
357
|
# _invdata_array_bld_enfcls < 0, _invdata_array_bld_enfcls, 0
|
|
@@ -342,6 +361,7 @@ def _construct_new_period_data(
|
|
|
342
361
|
# f"{_data_period}, {_table_no}, {_invdata_ind_group}:",
|
|
343
362
|
# abs(np.einsum('ij->', invdata_array_bld_tbc))
|
|
344
363
|
# )
|
|
364
|
+
# # // spellchecker: disable
|
|
345
365
|
|
|
346
366
|
# Enforce non-negativity
|
|
347
367
|
_invdata_array_bld_enfcls = np.stack((
|
|
@@ -385,21 +405,9 @@ def _invdata_build_aggregate_table(
|
|
|
385
405
|
)
|
|
386
406
|
|
|
387
407
|
|
|
388
|
-
def
|
|
389
|
-
_invdata_docnames: Sequence[str] = (
|
|
390
|
-
"040831horizmergersdata96-03.pdf",
|
|
391
|
-
"p035603horizmergerinvestigationdata1996-2005.pdf",
|
|
392
|
-
"081201hsrmergerdata.pdf",
|
|
393
|
-
"130104horizontalmergerreport.pdf",
|
|
394
|
-
),
|
|
395
|
-
) -> INVData:
|
|
408
|
+
def _parse_invdata() -> INVData:
|
|
396
409
|
"""Parse FTC merger investigations data reports to structured data.
|
|
397
410
|
|
|
398
|
-
Parameters
|
|
399
|
-
----------
|
|
400
|
-
_invdata_docnames
|
|
401
|
-
Names of PDF files reporting FTC merger investigations data.
|
|
402
|
-
|
|
403
411
|
Returns
|
|
404
412
|
-------
|
|
405
413
|
Immutable dictionary of merger investigations data, keyed to
|
|
@@ -408,12 +416,23 @@ def parse_invdata(
|
|
|
408
416
|
by range of HHI and ∆HHI.
|
|
409
417
|
|
|
410
418
|
"""
|
|
419
|
+
raise ValueError(
|
|
420
|
+
"This function is defined here as documentation.\n"
|
|
421
|
+
"NOTE: License for `pymupdf`, upon which this function depends,"
|
|
422
|
+
" may be incompatible with the MIT license,"
|
|
423
|
+
" under which this pacakge is distributed."
|
|
424
|
+
" Making this fumction operable requires the user to modify"
|
|
425
|
+
" the source code as well as to install an additional package"
|
|
426
|
+
" not distributed with this package or included in its dependencies."
|
|
427
|
+
)
|
|
428
|
+
import fitz # type: ignore
|
|
429
|
+
|
|
430
|
+
_invdata_docnames = _download_invdata(FTCDATA_DIR)
|
|
431
|
+
|
|
411
432
|
_invdata: dict[str, dict[str, dict[str, INVTableData]]] = {}
|
|
412
433
|
|
|
413
434
|
for _invdata_docname in _invdata_docnames:
|
|
414
435
|
_invdata_pdf_path = FTCDATA_DIR.joinpath(_invdata_docname)
|
|
415
|
-
if not _invdata_pdf_path.is_file():
|
|
416
|
-
_download_invdata(FTCDATA_DIR)
|
|
417
436
|
|
|
418
437
|
_invdata_fitz = fitz.open(_invdata_pdf_path)
|
|
419
438
|
_invdata_meta = _invdata_fitz.metadata
|
|
@@ -478,7 +497,7 @@ def parse_invdata(
|
|
|
478
497
|
|
|
479
498
|
|
|
480
499
|
def _parse_page_blocks(
|
|
481
|
-
_invdata:
|
|
500
|
+
_invdata: _INVData_in, _data_period: str, _doc_pg_blocks: Sequence[Sequence[Any]], /
|
|
482
501
|
) -> None:
|
|
483
502
|
if _data_period != "1996-2011":
|
|
484
503
|
_parse_table_blocks(_invdata, _data_period, _doc_pg_blocks)
|
|
@@ -505,7 +524,7 @@ def _parse_page_blocks(
|
|
|
505
524
|
|
|
506
525
|
|
|
507
526
|
def _parse_table_blocks(
|
|
508
|
-
_invdata:
|
|
527
|
+
_invdata: _INVData_in, _data_period: str, _table_blocks: Sequence[Sequence[str]], /
|
|
509
528
|
) -> None:
|
|
510
529
|
_invdata_evid_cond = "Unrestricted on additional evidence"
|
|
511
530
|
_table_num, _table_ser, _table_type = _identify_table_type(
|
|
@@ -580,12 +599,12 @@ def _identify_table_type(_tnstr: str = CONC_TABLE_ALL, /) -> tuple[str, int, str
|
|
|
580
599
|
|
|
581
600
|
def _process_table_blks_conc_type(
|
|
582
601
|
_table_blocks: Sequence[Sequence[str]], /
|
|
583
|
-
) ->
|
|
602
|
+
) -> ArrayBIGINT:
|
|
584
603
|
_conc_row_pat = re.compile(r"((?:0|\d,\d{3}) (?:- \d+,\d{3}|\+)|TOTAL)")
|
|
585
604
|
|
|
586
605
|
_col_titles_array = tuple(CONC_DELTA_DICT.values())
|
|
587
|
-
_col_totals:
|
|
588
|
-
_invdata_array:
|
|
606
|
+
_col_totals: ArrayBIGINT = np.zeros(len(_col_titles_array), np.int64)
|
|
607
|
+
_invdata_array: ArrayBIGINT = np.array(None)
|
|
589
608
|
|
|
590
609
|
for _tbl_blk in _table_blocks:
|
|
591
610
|
if _conc_row_pat.match(_blk_str := _tbl_blk[-3]):
|
|
@@ -613,7 +632,7 @@ def _process_table_blks_conc_type(
|
|
|
613
632
|
_col_totals = _row_array
|
|
614
633
|
else:
|
|
615
634
|
_invdata_array = (
|
|
616
|
-
np.
|
|
635
|
+
np.vstack((_invdata_array, _row_array))
|
|
617
636
|
if _invdata_array.shape
|
|
618
637
|
else _row_array
|
|
619
638
|
)
|
|
@@ -637,13 +656,11 @@ def _process_table_blks_conc_type(
|
|
|
637
656
|
|
|
638
657
|
def _process_table_blks_cnt_type(
|
|
639
658
|
_table_blocks: Sequence[Sequence[str]], /
|
|
640
|
-
) ->
|
|
659
|
+
) -> ArrayBIGINT:
|
|
641
660
|
_cnt_row_pat = re.compile(r"(\d+ (?:to \d+|\+)|TOTAL)")
|
|
642
661
|
|
|
643
|
-
_invdata_array:
|
|
644
|
-
_col_totals:
|
|
645
|
-
3, np.int64
|
|
646
|
-
) # "enforced", "closed", "total"
|
|
662
|
+
_invdata_array: ArrayBIGINT = np.array(None)
|
|
663
|
+
_col_totals: ArrayBIGINT = np.zeros(3, np.int64) # "enforced", "closed", "total"
|
|
647
664
|
|
|
648
665
|
for _tbl_blk in _table_blocks:
|
|
649
666
|
if _cnt_row_pat.match(_blk_str := _tbl_blk[-3]):
|
|
@@ -660,7 +677,7 @@ def _process_table_blks_cnt_type(
|
|
|
660
677
|
_col_totals = _row_list
|
|
661
678
|
else:
|
|
662
679
|
_invdata_array = (
|
|
663
|
-
np.
|
|
680
|
+
np.vstack((_invdata_array, _row_list))
|
|
664
681
|
if _invdata_array.shape
|
|
665
682
|
else _row_list
|
|
666
683
|
)
|
|
@@ -676,32 +693,61 @@ def _process_table_blks_cnt_type(
|
|
|
676
693
|
return _invdata_array[np.argsort(_invdata_array[:, 0])]
|
|
677
694
|
|
|
678
695
|
|
|
679
|
-
def _download_invdata(_dl_path: Path) ->
|
|
696
|
+
def _download_invdata(_dl_path: Path = FTCDATA_DIR) -> tuple[str, ...]:
|
|
697
|
+
if not _dl_path.is_dir():
|
|
698
|
+
_dl_path.mkdir(parents=True)
|
|
699
|
+
|
|
680
700
|
_invdata_homepage_urls = (
|
|
681
701
|
"https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2003",
|
|
682
702
|
"https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2005-0",
|
|
683
703
|
"https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2007-0",
|
|
684
704
|
"https://www.ftc.gov/reports/horizontal-merger-investigation-data-fiscal-years-1996-2011",
|
|
685
705
|
)
|
|
686
|
-
_invdata_docnames =
|
|
706
|
+
_invdata_docnames = (
|
|
707
|
+
"040831horizmergersdata96-03.pdf",
|
|
708
|
+
"p035603horizmergerinvestigationdata1996-2005.pdf",
|
|
709
|
+
"081201hsrmergerdata.pdf",
|
|
710
|
+
"130104horizontalmergerreport.pdf",
|
|
711
|
+
)
|
|
712
|
+
|
|
713
|
+
if all(
|
|
714
|
+
_dl_path.joinpath(_invdata_docname).is_file()
|
|
715
|
+
for _invdata_docname in _invdata_docnames
|
|
716
|
+
):
|
|
717
|
+
return _invdata_docnames
|
|
718
|
+
|
|
719
|
+
_invdata_docnames_dl: tuple[str, ...] = ()
|
|
720
|
+
_u3pm = urllib3.PoolManager()
|
|
721
|
+
_chunk_size = 1024 * 1024
|
|
687
722
|
for _invdata_homepage_url in _invdata_homepage_urls:
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
723
|
+
with _u3pm.request(
|
|
724
|
+
"GET", _invdata_homepage_url, preload_content=False
|
|
725
|
+
) as _u3handle:
|
|
726
|
+
_invdata_soup = BeautifulSoup(_u3handle.data, "html.parser")
|
|
727
|
+
_invdata_attrs = [
|
|
728
|
+
(_g.get("title", ""), _g.get("href", ""))
|
|
729
|
+
for _g in _invdata_soup.find_all("a")
|
|
730
|
+
if _g.get("title", "") and _g.get("href", "").endswith(".pdf")
|
|
731
|
+
]
|
|
697
732
|
for _invdata_attr in _invdata_attrs:
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
with
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
733
|
+
_invdata_docname, _invdata_link = _invdata_attr
|
|
734
|
+
_invdata_docnames_dl += (_invdata_docname,)
|
|
735
|
+
with (
|
|
736
|
+
_u3pm.request(
|
|
737
|
+
"GET", f"https://www.ftc.gov/{_invdata_link}", preload_content=False
|
|
738
|
+
) as _urlopen_handle,
|
|
739
|
+
_dl_path.joinpath(_invdata_docname).open("wb") as _invdata_fh,
|
|
740
|
+
):
|
|
741
|
+
while True:
|
|
742
|
+
_data = _urlopen_handle.read(_chunk_size)
|
|
743
|
+
if not _data:
|
|
744
|
+
break
|
|
745
|
+
_invdata_fh.write(_data)
|
|
746
|
+
|
|
747
|
+
return _invdata_docnames_dl
|
|
706
748
|
|
|
707
|
-
|
|
749
|
+
|
|
750
|
+
if __name__ == "__main__":
|
|
751
|
+
print(
|
|
752
|
+
"This module defines functions for downloading and preparing FTC merger investigations data for further analysis."
|
|
753
|
+
)
|