warn-scraper: warn_scraper-1.2.117-py3-none-any.whl → warn_scraper-1.2.118-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/ky.py +65 -36
- warn/utils.py +35 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/RECORD +8 -8
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/licenses/LICENSE +0 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/top_level.txt +0 -0
warn/scrapers/ky.py
CHANGED

```diff
@@ -60,34 +60,62 @@ def scrape(
     # Open it up
     workbook = load_workbook(filename=latest_path)
 
-
+    crosswalk = {
+        "Closure or Layoff?": "closure_or_layoff",
+        "Company Name": "company",
+        "Company: Company Name": "company",
+        "County": "county",
+        "Date Received": "date_received",
+        "Employees": "employees",
+        "NAICS": "NAICS",
+        "NAICS Code": "NAICS",
+        "Notice Link": "notice_url",
+        "Notice Type": "source",
+        "Notice URL": "notice_url",
+        "Notice: Notice Number": "notice_number",
+        "Number of Employees Affected": "employees",
+        "Projected Date": "date_effective",
+        "Region": "region",
+        "Trade": "trade",
+        "Type of Employees Affected": "union_affected",
+        "Workforce Board": "region",
+        "address": "address",
+        "comments": "comments",
+        "congressional": "congressional",
+        "contact": "contact",
+        "industry": "industry",
+        "neg": "neg",  # Rarely seen, only in historical data, maybe all with "N"
+        "occupations": "industry",
+        "source": "source",
+        "union": "union",  # Unclear if different than types of employees affected/union_affected
+    }
+
+    masterlist: list = []
     for sheet in workbook.worksheets:
         localrows = parse_xlsx(sheet)
-        dirty_list.extend(localrows)
 
-
-
-
-
-
-
-
-            logger.debug(
-                f"Dropping dirty row that doesn't quite match headers in row {rowindex}"
-            )
-            logger.debug(f"Want: {headers}")
-            logger.debug(f"Got : {row}")
+        # Traverse each tab. Assume the first line is a header. Check if the second line is bogus.
+        # Build a list of dicts.
+        localheadersraw: list = localrows[0]
+        localheaders: list = []
+        for entry in localheadersraw:
+            if entry not in crosswalk:
+                logger.error(f"Potential header {entry} not found in crosswalk.")
             else:
-
-
+                localheaders.append(crosswalk[entry])
+        for row in localrows[1:]:  # Skip the header row
+            if row[0] != "Date Received":  # Check for fake second header
+                line: dict = {}
+                for i, fieldname in enumerate(localheaders):
                     line[fieldname] = row[i]
-
-
-
-            f"Successfully merged {len(row_list)-1:,} records from new spreadsheet."
-        )
+                    if isinstance(row[i], str):
+                        line[fieldname] = row[i].strip()
+                masterlist.append(line)
 
-
+    logger.debug(f"Successfully merged {len(masterlist)} records from new spreadsheet.")
+
+    # Earlier versions of this code needed the archived data to match the new data.
+    # We can no longer expect that since October 2025 data revisions.
 
     archive_url = "https://storage.googleapis.com/bln-data-public/warn-layoffs/ky-historical-normalized.csv"
 
@@ -96,24 +124,25 @@ def scrape(
 
     reader = list(csv.reader(r.text.splitlines()))
 
-
-
-
-
-
-
-
+    localheadersraw = reader[0]
+    localheaders: list = []  # type: ignore
+    for entry in localheadersraw:
+        if entry not in crosswalk:
+            logger.error(f"Cannot match possible header value of {entry} to crosswalk.")
+        else:
+            localheaders.append(crosswalk[entry])
     for row in reader[1:]:  # Skip header row
-        line = {}
-        for i,
-            line[
-
-
-
+        line: dict = {}  # type: ignore
+        for i, fieldname in enumerate(localheaders):
+            line[fieldname] = row[i]
+            if isinstance(row[i], str):
+                line[fieldname] = row[i].strip()
+        masterlist.append(line)
+    logger.debug("Historical records folded in.")
 
     # Write out the results
     data_path = data_dir / "ky.csv"
-    utils.
+    utils.write_disparate_dict_rows_to_csv(data_path, masterlist)
 
     # Pass it out
     return data_path
```
warn/utils.py
CHANGED

```diff
@@ -232,6 +232,41 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
         writer.writerow(row)
 
 
+def write_disparate_dict_rows_to_csv(output_path, rows, mode="w"):
+    """Write the provided list of dictionaries to the provided path as comma-separated values, while determining a header.
+
+    Args:
+        output_path (Path): the Path where the result will be saved
+        rows (list): the list of dictionaries to be saved; can have disparate dict keys
+        mode (str): the mode to be used when opening the file (default 'w')
+    """
+    create_directory(output_path, is_file=True)
+    headers: set = set()  # Get all the potential header names
+    for row in rows:
+        for item in row:
+            headers.add(item)
+    headers = list(sorted(headers))
+    logger.debug(f"Found {len(headers):,} header entries in list of dicts.")
+    logger.debug(f"Writing {len(rows)} rows to {output_path}")
+    with open(output_path, mode, newline="") as outfile:
+        # Create the writer object
+        writer = csv.writer(outfile)
+        # If we are writing a new file ...
+        if mode == "w":
+            # ... drop in the headers
+            writer.writerow(headers)
+        # Loop through the dicts and write them in one by one.
+        for row in rows:
+            line = {}
+            for item in headers:
+                if item in row:
+                    line[item] = row[item]
+                else:
+                    line[item] = None
+            writer.writerow(list(line.values()))
+    return
+
+
 def get_all_scrapers():
     """Get all the states and territories that have scrapers.
 
```
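Unlike `write_dict_rows_to_csv` above it, the new helper derives the header itself as the sorted union of every key seen, and fills missing fields with `None` (empty cells in the CSV). A quick usage sketch, assuming the 1.2.118 wheel is installed; the output path and toy rows are illustrative:

```python
from pathlib import Path

from warn import utils

# Rows with disparate keys, like the mix of current and historical KY records.
rows = [
    {"company": "Acme Corp", "employees": 120},
    {"company": "Globex", "date_received": "2025-10-01", "county": "Fayette"},
]

out = Path("/tmp/ky-demo.csv")  # illustrative path
utils.write_disparate_dict_rows_to_csv(out, rows)

print(out.read_text())
# company,county,date_received,employees
# Acme Corp,,,120
# Globex,Fayette,2025-10-01,
```

One caveat worth noting: with `mode="a"` the column order still comes from the current rows' keys, so appended rows only line up with an earlier write if the key set is unchanged.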
{warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.117
+Version: 1.2.118
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
```
{warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/RECORD
CHANGED

```diff
@@ -17,7 +17,7 @@ warn/__init__.py,sha256=A07JFY1TyaPtVIndBa7IvTk13DETqIkLgRdk0A-MCoE,85
 warn/cache.py,sha256=hyta04_G-ALGwcKl4xNc7EgHS_xklyVD5d8SXNrJekY,5520
 warn/cli.py,sha256=ZqyJwICdHFkn2hEgbArj_upbElR9-TSDlYDqyEGeexE,2019
 warn/runner.py,sha256=oeGRybGwpnkQKlPzRMlKxhsDt1GN4PZoX-vUwrsPgos,1894
-warn/utils.py,sha256=
+warn/utils.py,sha256=67ltJ1ZDCqLfZoFcI8kp5BaTbv28ZzOfzaDvlbnVBfM,11821
 warn/platforms/__init__.py,sha256=wIZRDf4tbTuC8oKM4ZrTAtwNgbtMQGzPXMwDYCFyrog,81
 warn/platforms/job_center/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 warn/platforms/job_center/cache.py,sha256=yhA3sE46lNFg8vEewSoRYVByi0YSlkBiKm7qoSUiTdM,1868
@@ -41,7 +41,7 @@ warn/scrapers/id.py,sha256=qJLcLgCgAfKzLpuwW32JqNwXn9NxZRZvQ50nZZKUhmE,6674
 warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
 warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
 warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
-warn/scrapers/ky.py,sha256=
+warn/scrapers/ky.py,sha256=IDIzULH5h-UqGCvKvvipYbi5Gg3_rmue_o9SgF7QWqs,5843
 warn/scrapers/la.py,sha256=ORkMOQErl33SEiagOli4agDLdTt0R1MxxBmqOg3hNv8,13175
 warn/scrapers/md.py,sha256=hwgxXQnhyBWm8qF1dvxIThAX1MkrZbXLwRI9inO5t8g,4060
 warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
+warn_scraper-1.2.118.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
+warn_scraper-1.2.118.dist-info/METADATA,sha256=K3igkvyJ1uKPecz9UpRkx75hq1MN_GYy3GDRYHzX24k,2385
+warn_scraper-1.2.118.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+warn_scraper-1.2.118.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.118.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.118.dist-info/RECORD,,
```