warn-scraper 1.2.72__py3-none-any.whl → 1.2.74__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/ca.py CHANGED
@@ -224,20 +224,25 @@ def _extract_pdf_data(pdf_path):
             if "summary" in first_cell:
                 continue
             for row in rows:
-                data_row = {}
-                for i, value in enumerate(row):
-                    this_raw_header = raw_header[i]
-                    this_clean_header = header_crosswalk[this_raw_header]
-                    data_row[this_clean_header] = value
-                # Data clean-ups
-                data_row.update(
-                    {
-                        "effective_date": data_row["effective_date"].replace(" ", ""),
-                        "received_date": data_row["received_date"].replace(" ", ""),
-                        "source_file": str(pdf_path).split("/")[-1],
-                    }
-                )
-                data.append(data_row)
+                # Summary rows have an extra field, and the above code does not
+                # block the summary table from being parsed if it jumps onto another page.
+                if len(row) != len(raw_header) + 1:
+                    data_row = {}
+                    for i, value in enumerate(row):
+                        this_raw_header = raw_header[i]
+                        this_clean_header = header_crosswalk[this_raw_header]
+                        data_row[this_clean_header] = value
+                    # Data clean-ups
+                    data_row.update(
+                        {
+                            "effective_date": data_row["effective_date"].replace(
+                                " ", ""
+                            ),
+                            "received_date": data_row["received_date"].replace(" ", ""),
+                            "source_file": str(pdf_path).split("/")[-1],
+                        }
+                    )
+                    data.append(data_row)
 
     return data
 
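The ca.py change guards against summary tables that spill onto later pages: a summary row carries one more field than the data header, so any row whose length equals len(raw_header) + 1 is dropped before the header crosswalk is applied. Below is a minimal, self-contained sketch of that check; the header, crosswalk, and row values are made up for illustration rather than taken from the scraper's real pdfplumber output.

# Illustrative sketch only: raw_header, header_crosswalk and the rows below
# are hypothetical; the real scraper builds them from the PDF tables.
raw_header = ["Company", "County", "Received Date", "Effective Date"]
header_crosswalk = {
    "Company": "company",
    "County": "county",
    "Received Date": "received_date",
    "Effective Date": "effective_date",
}
rows = [
    ["Acme Corp", "Fresno", "01/02/ 2023", "02/01 /2023"],  # ordinary data row
    ["January", "3", "120", "0", "123"],  # summary row: one extra field
]

data = []
for row in rows:
    # Summary rows have one more field than the header, so skip them.
    if len(row) != len(raw_header) + 1:
        data_row = {header_crosswalk[h]: v for h, v in zip(raw_header, row)}
        # Strip stray spaces that table extraction leaves inside dates.
        data_row["received_date"] = data_row["received_date"].replace(" ", "")
        data_row["effective_date"] = data_row["effective_date"].replace(" ", "")
        data.append(data_row)

print(data)  # only the Acme Corp row survives the length check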
warn/scrapers/tn.py CHANGED
@@ -1,14 +1,14 @@
+import csv
 import typing
 from pathlib import Path
 
-import pdfplumber
 from bs4 import BeautifulSoup
 
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["anikasikka"]
-__tags__ = ["html", "pdf"]
+__authors__ = ["anikasikka", "stucka"]
+__tags__ = ["html"]
 __source__ = {
     "name": "Tennessee Department of Labor and Workforce Development",
     "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
@@ -37,13 +37,11 @@ def scrape(
     )
     html = page.text
     cache.write("tn/source.html", html)
+    soup = BeautifulSoup(html, "html5lib")
+    tables = soup.find_all(attrs={"class": "tn-datatable"})
+    rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")
 
-    # Grab the PDF with the archived historial data
-    pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf"
-    pdf_file = cache.download("tn/pdffile.pdf", pdf_url)
-
-    # Set the headers we'll use for both sources
-    tn_headers = [
+    dataheaders: typing.List = [
         "Notice Date",
         "Effective Date",
         "Received Date",
@@ -53,102 +51,42 @@ def scrape(
         "No. Of Employees",
         "Layoff/Closure",
         "Notice ID",
+        # "Notice URL",
     ]
-    cleaned_data: typing.List[typing.Any] = [tn_headers]
 
-    # Parse the latest HTML file and convert to a list of rows, with a header in the first row.
-    soup = BeautifulSoup(html, "html5lib")
+    staginglist: typing.List = []
+    for row in reversed(rows):
+        cells = row.find_all("td")
+        if len(cells) == 6:  # Filter for potentially valid rows
+            line: typing.Dict = {}
+            for item in dataheaders:  # Build an ordered dictionary with null values
+                line[item] = None
+            line["Notice Date"] = cells[0].text.strip()
+            line["Effective Date"] = cells[4].text.strip()
+            line["Company"] = cells[1].text.strip()
+            line["County"] = cells[2].text.strip()
+            line["No. Of Employees"] = cells[3].text.strip()
+            line["Notice ID"] = cells[5].text.strip()
+            # line['Notice URL'] = cells[1].find("a")['href']
+            staginglist.append(line)
+
+    # Bring in historical data
+    historical_file = cache_dir / "tn/tn_historical.csv"
+    historical_url = (
+        "https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv"
+    )
+    utils.fetch_if_not_cached(historical_file, historical_url)
+    historical_str = cache.read("tn/tn_historical.csv")
 
-    # Grab all the list items on the page
-    data_list = soup.find_all("p")
-
-    # Loop through them all, skipping the first item, which is a header
-    for data in data_list[1:]:
-        # splitting the data on its delimiter
-        items = str(data).split("|")
-
-        # making sure that the last item in the list is the data value of interest
-        # splitting based on last character of each text-html data sequence
-        raw_data = []
-        for item in items:
-            value_html = item.split(":")[-1]
-            value_soup = BeautifulSoup(value_html, "html5lib")
-            string_list = list(value_soup.stripped_strings)
-            if len(string_list) > 0:
-                value = string_list[-1]
-            else:
-                continue
-            raw_data.append(value)
-
-        # If there aren't six entries it's junk
-        if len(raw_data) != 6:
-            continue
-
-        # Pluck out the values we want
-        nice_data = [
-            raw_data[0],  # Notice Date
-            raw_data[4],  # Effective Date
-            "",  # Received Date
-            raw_data[1],  # Company
-            "",  # City
-            raw_data[2],  # County
-            raw_data[3],  # Number of employees
-            "",  # Layoff/Closure
-            raw_data[5],  # Notice ID
-        ]
-
-        # Add them to the master list
-        cleaned_data.append(nice_data)
-
-    # The PDF header blacklist of rows to toss
-    pdf_header_blacklist = [
-        "Notice Date",
-        "Total",
-    ]
+    historicallist = list(csv.DictReader(historical_str.splitlines()))
+
+    # Combine fresh and historical
+    staginglist.extend(historicallist)
 
-    # Open the PDF
-    with pdfplumber.open(pdf_file) as pdf:
-        # Loop through all the pages
-        for i, my_page in enumerate(pdf.pages):
-            # Sll even pages have data, odd pages don't have the data
-            if i % 2 != 0:
-                continue
-
-            # Pull out the table and loop through the rows
-            table = my_page.extract_table()
-            if not table:
-                continue
-
-            # Cut empty rows
-            row_list = [r for r in table if any(r)]
-            if not row_list:
-                continue
-
-            # If this is a summary table, skip it
-            first_cell = row_list[0][0]
-            assert first_cell
-            if first_cell.lower().strip() == "summary by month":
-                continue
-
-            # Loop through all the rows ...
-            for row in row_list:
-                # Skip remove redundant headers
-                if row[0] in pdf_header_blacklist:
-                    continue
-
-                # Toss in an empty Notice ID since it isn't in the PDF
-                row.append("")
-
-                # Add the data to our output
-                cleaned_data.append(row)
-
-    # Set the path to the final CSV
     output_csv = data_dir / "tn.csv"
 
-    # Write out the rows to the export directory
-    utils.write_rows_to_csv(output_csv, cleaned_data)
+    utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)
 
-    # Return the path to the final CSV
     return output_csv
 
 
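Taken together, the rewritten tn.py drops the pdfplumber pass over the archived PDF: current notices are read from the page's tn-datatable rows, the backlog comes from a historical CSV hosted by Big Local News, and both are merged into one list of dictionaries before utils.write_dict_rows_to_csv writes the output. The sketch below is a rough illustration of that flow run against inline stand-in HTML and CSV rather than the live page and download; the sample values are hypothetical.

import csv

from bs4 import BeautifulSoup

# Stand-in markup mimicking one tn-datatable row; the real scraper parses the live page.
html = """
<table class="tn-datatable">
  <tr><td>5/1/2023</td><td>Acme Corp</td><td>Shelby</td><td>150</td><td>7/1/2023</td><td>2023-123</td></tr>
</table>
"""

dataheaders = [
    "Notice Date", "Effective Date", "Received Date", "Company", "City",
    "County", "No. Of Employees", "Layoff/Closure", "Notice ID",
]

rows = BeautifulSoup(html, "html5lib").find_all("tr")

staginglist = []
for row in reversed(rows):
    cells = row.find_all("td")
    if len(cells) == 6:  # only rows with the expected six cells are kept
        line = {item: None for item in dataheaders}
        line["Notice Date"] = cells[0].text.strip()
        line["Company"] = cells[1].text.strip()
        line["County"] = cells[2].text.strip()
        line["No. Of Employees"] = cells[3].text.strip()
        line["Effective Date"] = cells[4].text.strip()
        line["Notice ID"] = cells[5].text.strip()
        staginglist.append(line)

# The historical backlog arrives as a CSV download; an inline string stands in for it here.
historical_csv = "Notice Date,Company,County\n1/15/2019,Old Mill,Knox\n"
staginglist.extend(csv.DictReader(historical_csv.splitlines()))

print(staginglist)  # one fresh row plus one historical row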
warn_scraper-1.2.72.dist-info/METADATA → warn_scraper-1.2.74.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.72
+Version: 1.2.74
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
warn_scraper-1.2.72.dist-info/RECORD → warn_scraper-1.2.74.dist-info/RECORD RENAMED
@@ -28,7 +28,7 @@ warn/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
 warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
-warn/scrapers/ca.py,sha256=_LvkIci1nTUKBt5KC-wEcazWG7zoeUeadxj4D0XD97k,8170
+warn/scrapers/ca.py,sha256=rBTB-6LmNIlbGCqrCtI3O-w2e_0kcVSFxvjvh4EHBlk,8511
 warn/scrapers/co.py,sha256=g076Zqe8XA8tbW03HP6-03mJV8fft1niHfa5Sy6me9A,7388
 warn/scrapers/ct.py,sha256=HLMmBSFhT5Y3vZQUwRyCTxiG5BMQXTfG3SEj5rkQEL4,4771
 warn/scrapers/dc.py,sha256=_sHLnVqK_W90QqJb_W88yDlgPjoMl63LYZP3CJfdN9g,4484
@@ -58,16 +58,16 @@ warn/scrapers/or.py,sha256=0PjyrW3CHdxtHhqEo3Ob-9B6YckACoBD3K0c4FPQUcg,5208
 warn/scrapers/ri.py,sha256=vBbXFP5ClvqlOc_srR8sHsA8lpi7eLuMYm7ydUY5Fxo,4163
 warn/scrapers/sc.py,sha256=p3kscSNSW9C8C5QaSUbCAo6XibgB7G2iH6zaMH7Mnsc,4819
 warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
-warn/scrapers/tn.py,sha256=NBGAjNZ1_-a08Qdu0Y8PnxIjcqAozqdZAvXSAbPae-A,4770
+warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
 warn/scrapers/tx.py,sha256=7lRIA13CyU1taYdxDA-t6uRn5q13Cr3oR1SNaEe3Dlg,4329
 warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
 warn/scrapers/va.py,sha256=13lhkQrSkPGHEiWUuf1qiS890PWYE5gV-TgISpoiQnc,1711
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.72.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-warn_scraper-1.2.72.dist-info/METADATA,sha256=3SjdPSlNFIixBPUEqU3LKNIm6FnBkIc9ZYkYOJsRK9Y,2025
-warn_scraper-1.2.72.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-warn_scraper-1.2.72.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
-warn_scraper-1.2.72.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
-warn_scraper-1.2.72.dist-info/RECORD,,
+warn_scraper-1.2.74.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+warn_scraper-1.2.74.dist-info/METADATA,sha256=qUTAC44XVu9ARQFObvm3h9aJe0dAduopZX4-bP18i3k,2025
+warn_scraper-1.2.74.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+warn_scraper-1.2.74.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.74.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.74.dist-info/RECORD,,
warn_scraper-1.2.72.dist-info/WHEEL → warn_scraper-1.2.74.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 