warn-scraper 1.2.101__py3-none-any.whl → 1.2.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/co.py CHANGED
@@ -105,6 +105,7 @@ def scrape(
105
105
 
106
106
  # Clean up the headers
107
107
  header_crosswalk = {
108
+ "Name": "company",
108
109
  "Company Name": "company",
109
110
  "Company": "company",
110
111
  "WARN Date": "notice_date",
warn/scrapers/ct.py CHANGED
@@ -70,7 +70,9 @@ def scrape(
70
70
 
71
71
  # Parse out the table
72
72
  soup = BeautifulSoup(html, "html.parser")
73
- if year == 2016:
73
+ if year >= 2025:
74
+ table = soup.find_all("table", "style30")
75
+ elif year == 2016:
74
76
  table = soup.find_all("table", "style15")
75
77
  else:
76
78
  table = soup.find_all("table", "MsoNormalTable")
warn/scrapers/ny.py CHANGED
@@ -1,14 +1,15 @@
1
1
  import logging
2
2
  from pathlib import Path
3
3
 
4
- from bs4 import BeautifulSoup
5
- from openpyxl import load_workbook
6
-
7
4
  from .. import utils
8
5
  from ..cache import Cache
9
6
 
10
- __authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire"]
11
- __tags__ = ["historical", "excel"]
7
+ # from bs4 import BeautifulSoup
8
+ # from openpyxl import load_workbook
9
+
10
+
11
+ __authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire", "stucka"]
12
+ __tags__ = ["historical", "excel", "html"]
12
13
  __source__ = {
13
14
  "name": "New York Department of Labor",
14
15
  "url": "https://dol.ny.gov/warn-notices",
@@ -32,99 +33,33 @@ def scrape(
32
33
  """
33
34
  cache = Cache(cache_dir)
34
35
 
35
- # Get the latest HTML page
36
- url_list = [
37
- dict(year=2023, url="https://dol.ny.gov/warn-notices"),
38
- dict(year=2022, url="https://dol.ny.gov/2022-warn-notices"),
39
- dict(year=2021, url="https://dol.ny.gov/warn-notices-2021"),
40
- ]
36
+ """
37
+ In 2025 New York shifted from a collection of Excel and HTML to something in Tableau. Tableau notes:
38
+ Find a new landing page for a data page, now done in Tableau: https://dol.ny.gov/warn-dashboard
39
+ Scroll down and there's a "View in Tableau Public" I don't remember clicking
40
+ Opens in new tab at https://public.tableau.com/app/profile/kylee.teague2482/viz/WorkerAdjustmentRetrainingNotificationWARN/WARN
41
+ Append .csv to the end of that URL:
42
+ https://public.tableau.com/app/profile/kylee.teague2482/viz/WorkerAdjustmentRetrainingNotificationWARN/WARN.csv
43
+ Try it in requests: no good. Try it in the browser again; the file downloads. Find it in the downloads section of the browser, right-click, copy the download link, try that in requests and ... it worked?
44
+ """
41
45
 
42
- # Loop through the urls and get the stuff
43
- html_row_list = []
44
- for config in url_list:
45
- html_row_list += _get_html_data(cache, config)
46
+ url = "https://public.tableau.com/views/WorkerAdjustmentRetrainingNotificationWARN/WARN.csv?%3Adisplay_static_image=y&%3AbootstrapWhenNotified=true&%3Aembed=true&%3Alanguage=en-US&:embed=y&:showVizHome=n&:apiID=host0#navType=0&navSrc=Parse"
46
47
 
47
- # Get the historical static data file
48
- excel_row_list = _get_historical_data(cache)
48
+ csv_file = "ny/tableau.csv"
49
+
50
+ cache.download(csv_file, url)
51
+
52
+ mydata = cache.read_csv(csv_file)
49
53
 
50
54
  # Set the export path
51
55
  data_path = data_dir / "ny.csv"
52
56
 
53
57
  # Combine and write out the file
54
- fieldnames = list(html_row_list[0].keys()) + list(excel_row_list[0].keys())
55
- row_list = html_row_list + excel_row_list
56
- utils.write_dict_rows_to_csv(
57
- data_path,
58
- fieldnames,
59
- row_list,
60
- extrasaction="ignore",
61
- )
58
+ utils.write_rows_to_csv(data_path, mydata)
62
59
 
63
60
  # Return the path to the file
64
61
  return data_path
65
62
 
66
63
 
67
- def _get_html_data(cache, config):
68
- r = utils.get_url(config["url"])
69
- html = r.text
70
-
71
- # Save it to the cache
72
- cache.write(f"ny/{config['year']}.html", html)
73
-
74
- # Parse the HTML and grab our table
75
- soup = BeautifulSoup(html, "html.parser")
76
- table = soup.find("div", class_="landing-paragraphs").find("table")
77
-
78
- row_list = []
79
- # Loop through the rows of the table
80
- for tr in table.find_all("tr")[1:]:
81
- td_list = tr.find_all("td")
82
- d = dict(
83
- company_name=td_list[0].a.text,
84
- notice_url=td_list[0].a["href"],
85
- date_posted=td_list[1].text,
86
- notice_dated=td_list[2].text,
87
- )
88
- row_list.append(d)
89
- return row_list
90
-
91
-
92
- def _get_historical_data(cache):
93
- # Request the page and save it to the cache
94
- url = (
95
- "https://storage.googleapis.com/bln-data-public/warn-layoffs/ny_historical.xlsx"
96
- )
97
-
98
- excel_path = cache.download("ny/source.xlsx", url)
99
-
100
- # Open it up
101
- workbook = load_workbook(filename=excel_path)
102
-
103
- # Get the first sheet
104
- worksheet = workbook.worksheets[0]
105
-
106
- # Convert the sheet to a list of lists
107
- row_list = []
108
- for r in worksheet.rows:
109
- column = [cell.value for cell in r]
110
- row_list.append(column)
111
-
112
- # Transform this into a list of dictionaries with headers as keys
113
- header_list = row_list.pop(0)
114
- dict_list = []
115
- for row in row_list:
116
- d = {}
117
- for i, cell in enumerate(row):
118
- key = header_list[i]
119
- # Skip any columns where the header is null
120
- if key is None:
121
- continue
122
- d[key] = cell
123
- dict_list.append(d)
124
-
125
- # Return the list of dicts
126
- return dict_list
127
-
128
-
129
64
  if __name__ == "__main__":
130
65
  scrape()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.101
3
+ Version: 1.2.103
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -29,8 +29,8 @@ warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
29
29
  warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
30
30
  warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
31
31
  warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
32
- warn/scrapers/co.py,sha256=Zpz96Ftqti3Rx-vJEgSUAbvYmbHJ-w0tnt7mQ4FL6CA,7851
33
- warn/scrapers/ct.py,sha256=HLMmBSFhT5Y3vZQUwRyCTxiG5BMQXTfG3SEj5rkQEL4,4771
32
+ warn/scrapers/co.py,sha256=hUfqrzlhXQBkP4vxewVRrMZrgInoLer5S2MZlyYIQE4,7878
33
+ warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
34
34
  warn/scrapers/dc.py,sha256=_sHLnVqK_W90QqJb_W88yDlgPjoMl63LYZP3CJfdN9g,4484
35
35
  warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
36
36
  warn/scrapers/fl.py,sha256=YJ6Qt-jJZ7_iUKlHDaQuaV2gRmae8AJKS5dwwChadBE,9563
@@ -51,7 +51,7 @@ warn/scrapers/mt.py,sha256=t2MP4OCcuCEnrnvNgOu289P0eekZq4XaCK65qzgZX88,2457
51
51
  warn/scrapers/ne.py,sha256=JawuGJ3tCKvMd-N-p03gnltB4rol4QUJshMk2oyMPO4,4143
52
52
  warn/scrapers/nj.py,sha256=nwbMbeQuUJbYRVoyUyKZBmNqvqsXu3Habt-10r8DvZE,2230
53
53
  warn/scrapers/nm.py,sha256=HZpfLzn0LvLeRztYvqJ9n6FR5PYpyMndo8tzI8h9S2o,3581
54
- warn/scrapers/ny.py,sha256=kuBdgF1C_GnHEWlaLAas6zJG2Xhfj3c4q_-tuJLX2rk,3615
54
+ warn/scrapers/ny.py,sha256=hXbxPhiK-Eyc9h_05wkAsfdVIT0vayKX4EE5aiJVdBc,2291
55
55
  warn/scrapers/oh.py,sha256=2MEB_0AT37dsAsrhdl_Y0LUNHu0xGy4B1F7aSMhuUu0,3151
56
56
  warn/scrapers/ok.py,sha256=qJE49VY6dMhbokFB9IAOL2XyuYSJpEKKxITPO9sUHS4,1197
57
57
  warn/scrapers/or.py,sha256=0PjyrW3CHdxtHhqEo3Ob-9B6YckACoBD3K0c4FPQUcg,5208
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.101.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
- warn_scraper-1.2.101.dist-info/METADATA,sha256=sgTsL26CgS1htJ6S-r0m62ljnet7Ilg4-zWjBSiJmdA,2385
70
- warn_scraper-1.2.101.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
- warn_scraper-1.2.101.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.101.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.101.dist-info/RECORD,,
68
+ warn_scraper-1.2.103.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
+ warn_scraper-1.2.103.dist-info/METADATA,sha256=sszMf7V4gL8OsUkMeNzwFqW8Ii-wRXK_D1dZ9zhazBk,2385
70
+ warn_scraper-1.2.103.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
+ warn_scraper-1.2.103.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.103.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.103.dist-info/RECORD,,