warn-scraper 1.2.108-py3-none-any.whl → 1.2.110-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to its public registry. It is provided for informational purposes only.
- warn/scrapers/co.py +139 -12
- warn/scrapers/va.py +14 -4
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/RECORD +8 -8
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/licenses/LICENSE +0 -0
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/top_level.txt +0 -0
warn/scrapers/co.py
CHANGED
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup, Tag
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["anikasikka"]
+__authors__ = ["anikasikka", "stucka"]
 __tags__ = ["html"]
 __source__ = {
     "name": "Colorado Department of Labor and Employment",
@@ -53,13 +53,29 @@ def scrape(
     else:
         raise ValueError("Could not find Google Sheet link")
 
+    # Scraper had been working off partially loaded impression of the HTML in the dOM.
+    # This keyboard is not helping.
+    # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+    # let's try to get the actual HTML export of the page.
+    # 2016 has a different filename schema we need to account for.
+
+    if "/edit" in current_href:
+        better_link = current_href.split("/edit")[0] + "/gviz/tq?tqx=out:html"  # type: ignore
+    elif "drive.google.com/open?id=" in current_href:  # Work from the ID
+        better_link = "https://docs.google.com/spreadsheets/d/"
+        better_link += current_href.split("open?id=")[-1]  # type: ignore
+        better_link += "/gviz/tq?tqx=out:html"
+    else:
+        raise ValueError(f"Could not adapt {current_href} to find HTML export.")
+
     # Open the Google Sheet
-    current_page = utils.get_url(
+    current_page = utils.get_url(better_link)
     current_html = current_page.text
 
     # Parse the Google Sheet
     soup_current = BeautifulSoup(current_html, "html5lib")
-    table = soup_current.find(class_="waffle")
+    # table = soup_current.find(class_="waffle")
+    table = soup_current.find("table")
     cleaned_data = scrape_google_sheets(table)
 
     # Goes through the accordion links to get past data
@@ -78,9 +94,26 @@ def scrape(
     link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
     logger.debug(f"Requesting {len(link_list)} discovered links")
     for link in link_list:
-
+        bad_url = link["href"]
+        # Scraper had been working off partially loaded impression of the HTML in the dOM.
+        # This keyboard is not helping.
+        # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+        # let's try to get the actual HTML export of the page.
+        # 2016 has a different filename schema we need to account for.
+
+        if "/edit" in bad_url:
+            better_link = bad_url.split("/edit")[0] + "/gviz/tq?tqx=out:html"
+        elif "drive.google.com/open?id=" in bad_url:
+            better_link = "https://docs.google.com/spreadsheets/d/"
+            better_link += bad_url.split("open?id=")[-1]  # Get just the Id
+            better_link += "/gviz/tq?tqx=out:html"
+        else:
+            raise ValueError(f"Could not adapt {bad_url} to find HTML export.")
+
+        page = utils.get_url(better_link)
+
         soup = BeautifulSoup(page.text, "html5lib")
-        table = soup.find(
+        table = soup.find("table")
         if "2017" in link.text:
             header_list = [
                 "Company",
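For readability, the Google Sheets link rewrite that appears in both hunks above amounts to the standalone helper below. The function name is hypothetical; co.py performs this logic inline rather than through a shared function.

```python
def to_html_export(href: str) -> str:
    """Rewrite a Google Sheets share link into its plain-HTML export URL."""
    if "/edit" in href:
        # .../spreadsheets/d/<id>/edit#gid=0 -> .../spreadsheets/d/<id>/gviz/tq?tqx=out:html
        return href.split("/edit")[0] + "/gviz/tq?tqx=out:html"
    if "drive.google.com/open?id=" in href:
        # https://drive.google.com/open?id=<id> -> build the export URL from the ID
        sheet_id = href.split("open?id=")[-1]
        return f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:html"
    raise ValueError(f"Could not adapt {href} to find HTML export.")
```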
@@ -105,9 +138,9 @@ def scrape(
 
     # Clean up the headers
     header_crosswalk = {
-        "Name": "company",
         "Company Name": "company",
         "Company": "company",
+        "Name": "company",
         "WARN Date": "notice_date",
         "Total Layoffs": "jobs",
         "NAICS": "naics",
@@ -151,17 +184,108 @@ def scrape(
         "@dropdown": "dropdown",
         "Received": "received_date",
         "Notes": "notes",
+        # Only add new matches above here, not below here.
     }
+
+    header_garbage = {
+        # And then it got ugly with some columns getting unhidden.
+        "Timestamp": "timestamp",
+        "Email Address": "email_address",
+        "Is this a NEW WARN or a REVISION?": "is_this_a_new_warn_or_a_revision",
+        "Total number of employees with reduced hours": "total_number_of_employees_with_reduced_hours",
+        "Include the total number of employees on or expected to be on a Workshare plan.": "include_the_total_number_of_employees_on_or_expected_to_be_on_a_workshare_plan",
+        "Expected date of second job losses at location 1": "expected_date_of_second_job_losses_at_location_1",
+        "Expected end date of second job losses at location 1": "expected_end_date_of_second_job_losses_at_location_1",
+        "Expected date of third job losses at location 1": "expected_date_of_third_job_losses_at_location_1",
+        "Expected end date of third job losses at location 1": "expected_end_date_of_third_job_losses_at_location_1",
+        "Do the employees have bumping rights?": "do_the_employees_have_bumping_rights",
+        "Are the employees represented by a union?": "are_the_employees_represented_by_a_union",
+        "If you selected Rural Consortium for the workforce area, please choose a subarea using the map.": "if_you_selected_rural_consortium_for_the_workforce_area_please_choose_a_subarea_using_the_map",
+        "Name of union(s)": "name_of_unions",
+        "Contact phone number for union representative(s)": "contact_phone_number_for_union_representatives",
+        "Email address for union representative(s)": "email_address_for_union_representatives",
+        "Address, City, ZIP for Union 1": "address_city_zip_for_union_1",
+        "Has a second location been impacted?": "has_a_second_location_been_impacted",
+        "Location 2 Address": "location_2_address",
+        "Total number of employees at location 2": "total_number_of_employees_at_location_2",
+        "Total number of permanent layoffs at location 2": "total_number_of_permanent_layoffs_at_location_2",
+        "Total number of temporary layoffs at location 2": "total_number_of_temporary_layoffs_at_location_2",
+        "Total number of furloughs at location 2": "total_number_of_furloughs_at_location_2",
+        "Total number of employees with reduced hours at location 2": "total_number_of_employees_with_reduced_hours_at_location_2",
+        "Total number of employees on workshare plan at location 2": "total_number_of_employees_on_workshare_plan_at_location_2",
+        "Occupations Impacted at location 2": "occupations_impacted_at_location_2",
+        "Expected date of first job losses at location 2": "expected_date_of_first_job_losses_at_location_2",
+        "Contact name(s) for union representative(s)": "contact_names_for_union_representatives",
+        "Expected end date of first job losses at location 2": "expected_end_date_of_first_job_losses_at_location_2",
+        "Expected date of second job losses at location 2": "expected_date_of_second_job_losses_at_location_2",
+        "Expected end date of second job losses at location 2": "expected_end_date_of_second_job_losses_at_location_2",
+        "Expected date of third job losses at location 2": "expected_date_of_third_job_losses_at_location_2",
+        "Expected end date of third job losses at location 2": "expected_end_date_of_third_job_losses_at_location_2",
+        "Reason for Layoffs at location 2": "reason_for_layoffs_at_location_2",
+        "Do employees at location 2 having bumping rights?": "do_employees_at_location_2_having_bumping_rights",
+        "Are employees at location 2 represented by a union?": "are_employees_at_location_2_represented_by_a_union",
+        "Select the workforce area for location 2": "select_the_workforce_area_for_location_2",
+        "If you selected Other/Sub-Area, please choose a location from the following dropdown menu:": "if_you_selected_othersub_area_please_choose_a_location_from_the_following_dropdown_menu",
+        "Name of Union 2": "name_of_union_2",
+        "Contact name for Union 2": "contact_name_for_union_2",
+        "Contact phone number for Union 2": "contact_phone_number_for_union_2",
+        "Email address for Union 2": "email_address_for_union_2",
+        "Address, City, ZIP for Union 2": "address_city_zip_for_union_2",
+        "Has a third location been impacted?": "has_a_third_location_been_impacted",
+        "Location 3 Address": "location_3_address",
+        "Total number of employees at location 3": "total_number_of_employees_at_location_3",
+        "Total number of permanent layoffs at location 3": "total_number_of_permanent_layoffs_at_location_3",
+        "Total number of temporary layoffs at location 3": "total_number_of_temporary_layoffs_at_location_3",
+        "Total number of furloughs at location 3": "total_number_of_furloughs_at_location_3",
+        "Total number of employees with reduced hours at location 3": "total_number_of_employees_with_reduced_hours_at_location_3",
+        "Total number of employees on workshare plan at location 3": "total_number_of_employees_on_workshare_plan_at_location_3",
+        "Occupations Impacted at location 3": "occupations_impacted_at_location_3",
+        "Expected date of first job losses at location 3": "expected_date_of_first_job_losses_at_location_3",
+        "Expected end date of first job losses at location 3": "expected_end_date_of_first_job_losses_at_location_3",
+        "Expected date of second job losses at location 3": "expected_date_of_second_job_losses_at_location_3",
+        "Expected end date of second job losses at location 3": "expected_end_date_of_second_job_losses_at_location_3",
+        "Expected date of third job losses at location 3": "expected_date_of_third_job_losses_at_location_3",
+        "Expected end date of third job losses at location 3": "expected_end_date_of_third_job_losses_at_location_3",
+        "Reason for Layoffs at location 3": "reason_for_layoffs_at_location_3",
+        "Do employees at location 3 having bumping rights?": "do_employees_at_location_3_having_bumping_rights",
+        "Are employees at location 3 represented by a union?": "are_employees_at_location_3_represented_by_a_union",
+        "Select the workforce area for location 3": "select_the_workforce_area_for_location_3",
+        "Name of Union 3": "name_of_union_3",
+        "Contact name for Union 3": "contact_name_for_union_3",
+        "Contact phone number for Union 3": "contact_phone_number_for_union_3",
+        "Email address for Union 3": "email_address_for_union_3",
+        "Address, City, ZIP for Union 3": "address_city_zip_for_union_3",
+        "Include here any comments or additional details": "include_here_any_comments_or_additional_details",
+        # This is for garbage, not legit crosswalk. You probably do not want to add here.
+    }
+
     standardized_data = []
     for row in cleaned_data:
         row_dict = {}
+        mangled = []
+        for key in row:
+            if (
+                key not in header_crosswalk and key not in header_garbage
+            ):  # Get all missing keys at once
+                mangled.append(key)
+        if len(mangled) > 0:
+            logger.warning(f"Missing a bunch of keys: {'|'.join(mangled)}")
+
         for key, value in row.items():
-
-
+            if (
+                key not in header_crosswalk and key not in header_garbage
+            ):  # If we've never seen this before
+                logger.warning(f"Could not find {key} in header_crosswalk")
+                logger.warning(row)
+            if key not in header_garbage:  # if it's in the crosswalk, if it's legit
+                standardized_key = header_crosswalk[key]
+                row_dict[standardized_key] = value
         if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
             row_dict["company"] = "Avis Budget Group"
         if len(row_dict["company"]) < 3:  # or len(row_dict['naics']) <5:
             logger.debug(f"Dropping row of questionable quality: {row_dict}")
+        elif "begin_date" in row_dict and row_dict["begin_date"] == "Layoff Date(s)":
+            logger.debug(f"Dropping row of questionable quality: {row_dict}")
         else:
             standardized_data.append(row_dict)
 
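As a toy illustration of what the new crosswalk/garbage split is meant to do: columns found in header_crosswalk are kept under standardized names, columns found in header_garbage are dropped, and anything else is logged as unexpected. The miniature dictionaries and sample row below are stand-ins for the full ones in co.py.

```python
header_crosswalk = {"Company Name": "company", "WARN Date": "notice_date"}
header_garbage = {"Timestamp": "timestamp"}

row = {"Company Name": "Acme Corp", "WARN Date": "2024-01-05", "Timestamp": "x"}
row_dict = {}
for key, value in row.items():
    if key in header_crosswalk:
        row_dict[header_crosswalk[key]] = value  # keep it, under the standard name
    elif key in header_garbage:
        pass  # known noise column from the unhidden form fields; drop it
    else:
        print(f"Could not find {key} in header_crosswalk")  # unexpected column

print(row_dict)  # {'company': 'Acme Corp', 'notice_date': '2024-01-05'}
```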
@@ -188,23 +312,26 @@ def scrape_google_sheets(table, header_list=None):
 
     Returns: The parsed data as a list of dictionaries
     """
+    # logger.debug(table)
     # If a header list isn't provided, pull one out automatically
     if not header_list:
         # Pull out the header row
-        header_soup = table.find_all("tr")[1]
-
+        # header_soup = table.find_all("tr")[1]
+        header_soup = table.find_all("tr")[0]
         # Parse the header row into a list,
         # preserving its order in the sheet
         header_list = []
-        for cell in header_soup.find_all("td"):
+        for cellindex, cell in enumerate(header_soup.find_all("td")):
             cell_text = cell.text.strip()
             # Skip empty headers
             if cell_text:
                 header_list.append(cell_text)
+            if not cell_text and cellindex == 0:
+                header_list.append("Company Name")
 
     # Loop through all the data rows, which start
     # after the header and the little bar
-    tr_list = table.find_all("tr")[
+    tr_list = table.find_all("tr")[1:]
     logger.debug(f"Parsing {len(tr_list)} rows")
     row_list = []
     for row in tr_list:
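A rough end-to-end sketch of what the revised scrape_google_sheets() now assumes about the gviz HTML export: the header is the first table row and data rows follow immediately. The spreadsheet URL is a placeholder, and requests stands in for the scraper's own utils.get_url().

```python
import requests
from bs4 import BeautifulSoup

# Placeholder sheet ID; the scraper derives the real URL from links on the CDLE page.
url = "https://docs.google.com/spreadsheets/d/SHEET_ID/gviz/tq?tqx=out:html"
soup = BeautifulSoup(requests.get(url).text, "html5lib")  # html5lib package required
table = soup.find("table")

rows = table.find_all("tr")
# Header is row 0 in the export, hence find_all("tr")[0] above; data is everything after it.
header = [td.text.strip() for td in rows[0].find_all("td")]
data = [
    dict(zip(header, (td.text.strip() for td in tr.find_all("td"))))
    for tr in rows[1:]
]
```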
warn/scrapers/va.py
CHANGED
@@ -189,10 +189,20 @@ def scrape(
 
     logger.debug(f"Attempting to fetch {start_page}")
     driver.get(start_page)
-    sleep((4 * random()) +
-
-
-    ).
+    sleep((4 * random()) + 8)
+    # with open("va-weird.html", "w") as outfile:
+    #     outfile.write(driver.page_source)
+    # driver.find_element(By.ID, "warn-notice-well").find_element(
+    #     By.PARTIAL_LINK_TEXT, "Download"
+    # ).click()
+
+    # driver.find_element(By.PARTIAL_LINK_TEXT, "Download Full List of WARN notices").click()
+
+    # element = driver.find_element(By.CSS_SELECTOR, "#warn-notice-well a img")
+    element = driver.find_element(By.CSS_SELECTOR, "#warn-notice-well a")
+    logger.debug(f"Element found: {element.get_attribute('outerHTML')}")
+    driver.execute_script("arguments[0].click();", element)
+    # element.click()
 
     logger.debug(f"Attempting to fetch {csv_url}")
     # driver.get(csv_url)
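The va.py change replaces a direct .click() on the download link with a JavaScript click. A minimal standalone version of that pattern, assuming Selenium 4 with a Chrome driver and the same CSS selector, looks like this; the start URL is a placeholder.

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com/warn-notices")  # placeholder for the real start page

# Locate the download link inside the WARN notice well.
element = driver.find_element(By.CSS_SELECTOR, "#warn-notice-well a")

# Clicking via JavaScript sidesteps "element not interactable" or
# click-intercepted errors that a plain element.click() can raise.
driver.execute_script("arguments[0].click();", element)

driver.quit()
```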
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.108
+Version: 1.2.110
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/RECORD
CHANGED
@@ -29,7 +29,7 @@ warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
 warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
 warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
-warn/scrapers/co.py,sha256=
+warn/scrapers/co.py,sha256=YZP7BHluM6tmwdQ45ZpYzLvq1S7jHGBH-4ZXlSKDP5M,17509
 warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
 warn/scrapers/dc.py,sha256=p1_c7O2R3O-41DmvcLVUIRhQKUewvZZKkzWkBxytN5M,5165
 warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
 warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
 warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
 warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
-warn/scrapers/va.py,sha256=
+warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
+warn_scraper-1.2.110.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
+warn_scraper-1.2.110.dist-info/METADATA,sha256=oyI5DrPOS62n4q0ElEvm9R8fwzTkNOus3MLwzzQYLic,2385
+warn_scraper-1.2.110.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+warn_scraper-1.2.110.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.110.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.110.dist-info/RECORD,,
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/WHEEL
File without changes
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/entry_points.txt
File without changes
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/licenses/LICENSE
File without changes
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/top_level.txt
File without changes