corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,538 @@
+ """
+ GLEIF data importer for the company database.
+
+ Imports Legal Entity Identifier (LEI) data from GLEIF files
+ into the embedding database for company name matching.
+
+ Supports:
+ - JSON files (API responses, concatenated JSON)
+ - XML files (official GLEIF LEI-CDF v3.1 format)
+ """
+
+ import json
+ import logging
+ import xml.etree.ElementTree as ET
+ from pathlib import Path
+ from typing import Any, Iterator, Optional
+
+ from ..models import CompanyRecord, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # XML namespaces for GLEIF LEI-CDF format
+ LEI_NAMESPACES = {
+     'lei': 'http://www.gleif.org/data/schema/leidata/2016',
+ }
+
+ # Mapping from GLEIF EntityCategory to our EntityType
+ # See: https://www.gleif.org/en/about-lei/common-data-file-format
+ GLEIF_CATEGORY_TO_ENTITY_TYPE: dict[str, EntityType] = {
+     "GENERAL": EntityType.BUSINESS,  # Regular legal entities (companies)
+     "FUND": EntityType.FUND,  # Investment funds, ETFs, mutual funds
+     "BRANCH": EntityType.BRANCH,  # Branch offices of companies
+     "SOLE_PROPRIETOR": EntityType.BUSINESS,  # Sole proprietorships (still a business)
+     "INTERNATIONAL_ORGANIZATION": EntityType.INTERNATIONAL_ORG,  # UN, WHO, IMF, etc.
+     "": EntityType.UNKNOWN,  # Empty/unset
+ }
+
+
+ class GleifImporter:
+     """
+     Importer for GLEIF LEI data.
+
+     Supports:
+     - JSON concatenated files (level1-concatenated.json)
+     - Individual JSON records
+     - Streaming import for large files
+
+     Maps GLEIF EntityCategory to EntityType:
+     - GENERAL -> BUSINESS
+     - FUND -> FUND
+     - BRANCH -> BRANCH
+     - SOLE_PROPRIETOR -> BUSINESS
+     - INTERNATIONAL_ORGANIZATION -> INTERNATIONAL_ORG
+     """
+
+     def __init__(self, active_only: bool = True):
+         """
+         Initialize the GLEIF importer.
+
+         Args:
+             active_only: Only import ACTIVE entities (default True)
+         """
+         self._active_only = active_only
+
+     def import_from_file(
+         self,
+         file_path: str | Path,
+         limit: Optional[int] = None,
+     ) -> Iterator[CompanyRecord]:
+         """
+         Import records from a GLEIF file.
+
+         Supports:
+         - XML files (official GLEIF LEI-CDF v3.1 format)
+         - JSON array files
+         - Concatenated JSON files (one object per line)
+
+         Args:
+             file_path: Path to GLEIF file (XML or JSON)
+             limit: Optional limit on number of records
+
+         Yields:
+             CompanyRecord for each valid entity
+         """
+         file_path = Path(file_path)
+
+         if not file_path.exists():
+             raise FileNotFoundError(f"GLEIF file not found: {file_path}")
+
+         logger.info(f"Importing GLEIF data from {file_path}")
+
+         # Detect file format by extension or content
+         if file_path.suffix.lower() == ".xml":
+             yield from self._import_xml_streaming(file_path, limit)
+         else:
+             # Try to detect JSON format
+             with open(file_path, "r", encoding="utf-8") as f:
+                 first_char = f.read(1)
+
+             if first_char == "<":
+                 # XML content
+                 yield from self._import_xml_streaming(file_path, limit)
+             elif first_char == "[":
+                 # JSON array format
+                 yield from self._import_json_array(file_path, limit)
+             else:
+                 # Concatenated JSON format (one object per line)
+                 yield from self._import_concatenated_json(file_path, limit)
+
+     def _import_json_array(
+         self,
+         file_path: Path,
+         limit: Optional[int],
+     ) -> Iterator[CompanyRecord]:
+         """Import from JSON array format."""
+         with open(file_path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+
+         records = data if isinstance(data, list) else data.get("data", [])
+         count = 0
+
+         for raw_record in records:
+             if limit and count >= limit:
+                 break
+
+             record = self._parse_record(raw_record)
+             if record:
+                 count += 1
+                 yield record
+
+                 if count % 10000 == 0:
+                     logger.info(f"Imported {count} GLEIF records")
+
+         logger.info(f"Completed GLEIF import: {count} records")
+
+     def _import_concatenated_json(
+         self,
+         file_path: Path,
+         limit: Optional[int],
+     ) -> Iterator[CompanyRecord]:
+         """Import from concatenated JSON format (one object per line)."""
+         count = 0
+
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 if limit and count >= limit:
+                     break
+
+                 line = line.strip()
+                 if not line:
+                     continue
+
+                 try:
+                     raw_record = json.loads(line)
+                     record = self._parse_record(raw_record)
+                     if record:
+                         count += 1
+                         yield record
+
+                         if count % 10000 == 0:
+                             logger.info(f"Imported {count} GLEIF records")
+
+                 except json.JSONDecodeError as e:
+                     logger.debug(f"Failed to parse line: {e}")
+                     continue
+
+         logger.info(f"Completed GLEIF import: {count} records")
+
+     def _import_xml_streaming(
+         self,
+         file_path: Path,
+         limit: Optional[int],
+     ) -> Iterator[CompanyRecord]:
+         """
+         Import from XML file using streaming parser.
+
+         Uses iterparse for memory-efficient parsing of large XML files.
+         """
+         logger.info(f"Starting streaming XML parse of {file_path}")
+         count = 0
+
+         try:
+             context = ET.iterparse(str(file_path), events=('end',))
+
+             for event, elem in context:
+                 # Look for LEIRecord elements (handle both namespaced and non-namespaced)
+                 if elem.tag.endswith('LEIRecord'):
+                     if limit and count >= limit:
+                         break
+
+                     record = self._parse_xml_record(elem)
+                     if record:
+                         count += 1
+                         yield record
+
+                         if count % 10000 == 0:
+                             logger.info(f"Parsed {count} XML records")
+
+                     # Clear element to free memory
+                     elem.clear()
+
+             logger.info(f"Completed XML import: {count} records")
+
+         except ET.ParseError as e:
+             logger.error(f"XML parsing error: {e}")
+             raise ValueError(f"Failed to parse XML file {file_path}: {e}") from e
+
+     def _parse_xml_record(self, lei_record: ET.Element) -> Optional[CompanyRecord]:
+         """Parse a single LEI record from XML."""
+         try:
+             # Helper to find elements with or without namespace
+             def find_text(parent: ET.Element, tag: str) -> Optional[str]:
+                 # Try with namespace first
+                 elem = parent.find(f'.//lei:{tag}', LEI_NAMESPACES)
+                 if elem is None:
+                     # Try without namespace
+                     elem = parent.find(f'.//{tag}')
+                 return elem.text if elem is not None else None
+
+             def find_elem(parent: ET.Element, tag: str) -> Optional[ET.Element]:
+                 elem = parent.find(f'.//lei:{tag}', LEI_NAMESPACES)
+                 if elem is None:
+                     elem = parent.find(f'.//{tag}')
+                 return elem
+
+             # Get LEI
+             lei = find_text(lei_record, 'LEI')
+             if not lei or len(lei) != 20:
+                 return None
+
+             # Get Entity element
+             entity_elem = find_elem(lei_record, 'Entity')
+             if entity_elem is None:
+                 return None
+
+             # Get legal name
+             legal_name = find_text(entity_elem, 'LegalName')
+             if not legal_name:
+                 return None
+
+             # Get status - skip inactive if configured
+             status = find_text(lei_record, 'EntityStatus')
+             if self._active_only and status and status.upper() != 'ACTIVE':
+                 return None
+
+             # Get entity category and map to EntityType
+             entity_category = find_text(entity_elem, 'EntityCategory') or ""
+             entity_type = GLEIF_CATEGORY_TO_ENTITY_TYPE.get(
+                 entity_category.upper(),
+                 GLEIF_CATEGORY_TO_ENTITY_TYPE.get(entity_category, EntityType.UNKNOWN)
+             )
+
+             # Get jurisdiction
+             jurisdiction = find_text(entity_elem, 'LegalJurisdiction')
+
+             # Get address info
+             legal_address = find_elem(entity_elem, 'LegalAddress')
+             country = ""
+             city = ""
+             if legal_address is not None:
+                 country = find_text(legal_address, 'Country') or ""
+                 city = find_text(legal_address, 'City') or ""
+
+             # Get other names
+             other_names = []
+             other_names_elem = find_elem(entity_elem, 'OtherEntityNames')
+             if other_names_elem is not None:
+                 for name_elem in other_names_elem:
+                     if name_elem.text:
+                         other_names.append(name_elem.text)
+
+             # Build record
+             name = legal_name.strip()
+             record_data = {
+                 "lei": lei,
+                 "legal_name": legal_name,
+                 "status": status,
+                 "jurisdiction": jurisdiction,
+                 "country": country,
+                 "city": city,
+                 "entity_category": entity_category,
+                 "other_names": other_names,
+             }
+
+             return CompanyRecord(
+                 name=name,
+                 source="gleif",
+                 source_id=lei,
+                 region=country,
+                 entity_type=entity_type,
+                 record=record_data,
+             )
+
+         except Exception as e:
+             logger.debug(f"Failed to parse XML record: {e}")
+             return None
+
+     def _parse_record(self, raw: dict[str, Any]) -> Optional[CompanyRecord]:
+         """
+         Parse a raw GLEIF record into a CompanyRecord.
+
+         Handles both API response format and bulk file format.
+         """
+         try:
+             # Handle nested structure from API or bulk files
+             attrs = raw.get("attributes", raw)
+             entity = attrs.get("entity", attrs)
+
+             # Get status - skip inactive if configured
+             registration = attrs.get("registration", {})
+             status = registration.get("status") or entity.get("status") or raw.get("status")
+             if self._active_only and status and status.upper() != "ACTIVE":
+                 return None
+
+             # Get entity category and map to EntityType
+             entity_category = entity.get("category") or entity.get("EntityCategory") or ""
+             entity_type = GLEIF_CATEGORY_TO_ENTITY_TYPE.get(
+                 entity_category.upper(),
+                 GLEIF_CATEGORY_TO_ENTITY_TYPE.get(entity_category, EntityType.UNKNOWN)
+             )
+
+             # Get LEI
+             lei = raw.get("id") or attrs.get("lei") or raw.get("LEI")
+             if not lei:
+                 return None
+
+             # Get legal name - handle GLEIF JSON format with nested "$" key
+             legal_name_obj = entity.get("legalName", {})
+             if isinstance(legal_name_obj, dict):
+                 legal_name = legal_name_obj.get("name") or legal_name_obj.get("$", "")
+             else:
+                 legal_name = legal_name_obj or ""
+
+             if not legal_name:
+                 # Try alternative locations
+                 legal_name = entity.get("LegalName") or raw.get("legal_name") or ""
+
+             if not legal_name:
+                 return None
+
+             # Get other names for better matching
+             other_names = []
+             other_names_list = entity.get("otherNames", []) or entity.get("OtherEntityNames", [])
+             for other in other_names_list:
+                 if isinstance(other, dict):
+                     name = other.get("name") or other.get("$", "")
+                 else:
+                     name = str(other)
+                 if name:
+                     other_names.append(name)
+
+             # Use legal name as primary, but store others in record
+             name = legal_name.strip()
+
+             # Get jurisdiction and address info
+             jurisdiction = entity.get("jurisdiction") or entity.get("LegalJurisdiction")
+             legal_address = entity.get("legalAddress", {})
+             if isinstance(legal_address, dict):
+                 country = legal_address.get("country") or legal_address.get("Country", "")
+                 city = legal_address.get("city") or legal_address.get("City", "")
+             else:
+                 country = ""
+                 city = ""
+
+             # Build record with relevant data
+             record_data = {
+                 "lei": lei,
+                 "legal_name": legal_name,
+                 "status": status,
+                 "jurisdiction": jurisdiction,
+                 "country": country,
+                 "city": city,
+                 "entity_category": entity_category,
+                 "other_names": other_names,
+             }
+
+             return CompanyRecord(
+                 name=name,
+                 source="gleif",
+                 source_id=lei,
+                 region=country,
+                 entity_type=entity_type,
+                 record=record_data,
+             )
+
+         except Exception as e:
+             logger.debug(f"Failed to parse GLEIF record: {e}")
+             return None
+
+     def get_latest_file_info(self) -> dict[str, Any]:
+         """
+         Get information about the latest GLEIF LEI file.
+
+         Returns:
+             Dict with file metadata including 'id', 'publish_date', 'record_count'
+         """
+         import urllib.request
+
+         # GLEIF API to list available concatenated files
+         api_url = "https://leidata.gleif.org/api/v1/concatenated-files/lei2"
+
+         logger.info("Checking for latest GLEIF data file...")
+
+         req = urllib.request.Request(
+             api_url,
+             headers={"Accept": "application/json"}
+         )
+
+         with urllib.request.urlopen(req) as response:
+             data = json.loads(response.read().decode("utf-8"))
+
+         # The API returns files sorted by date, most recent first
+         files = data.get("data", [])
+         if not files:
+             raise RuntimeError("No GLEIF files available from API")
+
+         latest = files[0]
+         file_id = latest.get("id")
+         # Fields are at top level, not nested under "attributes"
+         record_count = latest.get("record_count")
+         content_date = latest.get("content_date")
+
+         info = {
+             "id": file_id,
+             "publish_date": content_date,
+             "record_count": record_count,
+             "cdf_version": latest.get("cdf_version"),
+         }
+
+         record_str = f"{record_count:,}" if record_count else "unknown"
+         logger.info(
+             f"Latest GLEIF file: ID={file_id}, "
+             f"date={content_date}, "
+             f"records={record_str}"
+         )
+
+         return info
+
+     def download_latest(
+         self,
+         output_path: Optional[Path] = None,
+         force: bool = False,
+     ) -> Path:
+         """
+         Download the latest GLEIF data file.
+
+         Automatically fetches the most recent file from GLEIF's API.
+         Caches downloads and skips re-downloading if the same file ID exists.
+
+         Args:
+             output_path: Where to save the file (default: temp directory)
+             force: Force re-download even if cached
+
+         Returns:
+             Path to downloaded file
+         """
+         import shutil
+         import tempfile
+         import urllib.request
+         import zipfile
+
+         # Get latest file info from API
+         file_info = self.get_latest_file_info()
+         file_id = file_info["id"]
+
+         # Set up output directory and paths
+         if output_path is None:
+             output_dir = Path(tempfile.gettempdir()) / "gleif"
+             output_dir.mkdir(parents=True, exist_ok=True)
+             output_path = output_dir / "lei-records.xml"
+         else:
+             output_dir = output_path.parent
+
+         # Check for cached version using metadata file
+         metadata_path = output_dir / "gleif_metadata.json"
+         if not force and output_path.exists() and metadata_path.exists():
+             try:
+                 with open(metadata_path, "r") as f:
+                     cached_metadata = json.load(f)
+                 if cached_metadata.get("file_id") == file_id:
+                     logger.info(
+                         f"Using cached GLEIF data (file ID: {file_id}, "
+                         f"date: {cached_metadata.get('publish_date')})"
+                     )
+                     return output_path
+             except (json.JSONDecodeError, IOError):
+                 pass  # Metadata corrupted, re-download
+
+         # Build download URL for the latest file
+         url = f"https://leidata.gleif.org/api/v1/concatenated-files/lei2/get/{file_id}/zip"
+
+         logger.info(f"Downloading GLEIF data (file ID: {file_id}) from {url}")
+
+         # Download ZIP file
+         zip_path = output_path.with_suffix(".zip")
+         urllib.request.urlretrieve(url, zip_path)
+
+         # Extract data file from ZIP (XML or JSON)
+         with zipfile.ZipFile(zip_path, "r") as zf:
+             extracted = False
+             for name in zf.namelist():
+                 # Prefer XML (official format), fall back to JSON
+                 if name.endswith(".xml") or name.endswith(".json"):
+                     logger.info(f"Extracting {name}...")
+                     # Update output path extension to match extracted file
+                     if name.endswith(".xml"):
+                         output_path = output_path.with_suffix(".xml")
+                     else:
+                         output_path = output_path.with_suffix(".json")
+                     with zf.open(name) as src, open(output_path, "wb") as dst:
+                         shutil.copyfileobj(src, dst)
+                     extracted = True
+                     break
+
+         if not extracted:
+             raise RuntimeError("No XML or JSON file found in GLEIF ZIP archive")
+
+         # Clean up ZIP
+         zip_path.unlink()
+
+         # Save metadata for caching
+         metadata = {
+             "file_id": file_id,
+             "publish_date": file_info.get("publish_date"),
+             "record_count": file_info.get("record_count"),
+             "downloaded_at": str(Path(output_path).stat().st_mtime),
+             "output_path": str(output_path),
+         }
+         with open(metadata_path, "w") as f:
+             json.dump(metadata, f)
+
+         record_count = file_info.get('record_count')
+         record_str = f"{record_count:,}" if record_count else "unknown"
+         logger.info(
+             f"Downloaded GLEIF data to {output_path} "
+             f"(published: {file_info['publish_date']}, records: {record_str})"
+         )
+         return output_path
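
For orientation, a minimal usage sketch of the importer added in this file follows. It is an illustration based only on the names visible in the diff above (GleifImporter, download_latest, import_from_file, and the CompanyRecord fields name, source_id, entity_type); the limit value and printed fields are arbitrary choices, and the CLI and database wiring that 0.9.0 ships around this class is not shown.

    from statement_extractor.database.importers.gleif import GleifImporter

    # Fetch the latest LEI concatenated file (cached by GLEIF file ID),
    # then stream CompanyRecord objects out of it.
    importer = GleifImporter(active_only=True)  # skip non-ACTIVE entities
    data_file = importer.download_latest()      # .xml or .json, depending on ZIP contents

    # limit=10 keeps the sketch fast; drop it for a full import.
    for record in importer.import_from_file(data_file, limit=10):
        print(record.name, record.source_id, record.entity_type)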