uk_bin_collection 0.154.0__py3-none-any.whl → 0.158.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uk_bin_collection/tests/input.json +21 -10
- uk_bin_collection/uk_bin_collection/councils/AberdeenCityCouncil.py +0 -1
- uk_bin_collection/uk_bin_collection/councils/DacorumBoroughCouncil.py +22 -13
- uk_bin_collection/uk_bin_collection/councils/EastDunbartonshireCouncil.py +52 -0
- uk_bin_collection/uk_bin_collection/councils/IslingtonCouncil.py +8 -5
- uk_bin_collection/uk_bin_collection/councils/LancasterCityCouncil.py +23 -10
- uk_bin_collection/uk_bin_collection/councils/LondonBoroughSutton.py +60 -49
- uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py +70 -92
- uk_bin_collection/uk_bin_collection/councils/NewForestCouncil.py +104 -47
- uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py +138 -21
- uk_bin_collection/uk_bin_collection/councils/NorthumberlandCouncil.py +182 -3
- uk_bin_collection/uk_bin_collection/councils/OxfordCityCouncil.py +1 -0
- uk_bin_collection/uk_bin_collection/councils/RenfrewshireCouncil.py +170 -13
- uk_bin_collection/uk_bin_collection/councils/RotherhamCouncil.py +70 -38
- uk_bin_collection/uk_bin_collection/councils/SomersetCouncil.py +136 -21
- uk_bin_collection/uk_bin_collection/councils/SouthGloucestershireCouncil.py +18 -22
- uk_bin_collection/uk_bin_collection/councils/TestValleyBoroughCouncil.py +138 -21
- {uk_bin_collection-0.154.0.dist-info → uk_bin_collection-0.158.0.dist-info}/METADATA +1 -1
- {uk_bin_collection-0.154.0.dist-info → uk_bin_collection-0.158.0.dist-info}/RECORD +22 -21
- {uk_bin_collection-0.154.0.dist-info → uk_bin_collection-0.158.0.dist-info}/LICENSE +0 -0
- {uk_bin_collection-0.154.0.dist-info → uk_bin_collection-0.158.0.dist-info}/WHEEL +0 -0
- {uk_bin_collection-0.154.0.dist-info → uk_bin_collection-0.158.0.dist-info}/entry_points.txt +0 -0
@@ -1,3 +1,4 @@
+import time
 from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
@@ -27,14 +28,26 @@ class CouncilClass(AbstractGetBinDataClass):
         check_paon(user_paon)
         check_postcode(user_postcode)

-        # Create Selenium webdriver
-
+        # Create Selenium webdriver with user agent to bypass Cloudflare
+        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+        driver = create_webdriver(web_driver, headless, user_agent, __name__)
         driver.get("https://www.renfrewshire.gov.uk/bin-day")

-
-
+        # Wait for initial page load and Cloudflare bypass
+        WebDriverWait(driver, 30).until(
+            lambda d: "Just a moment" not in d.title and d.title != ""
         )
-
+        time.sleep(3)
+
+        # Try to accept cookies if the banner appears
+        try:
+            accept_button = WebDriverWait(driver, 10).until(
+                EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
+            )
+            accept_button.click()
+            time.sleep(2)
+        except:
+            pass

         # Wait for the postcode field to appear then populate it
         inputElement_postcode = WebDriverWait(driver, 30).until(
@@ -64,23 +77,167 @@ class CouncilClass(AbstractGetBinDataClass):
             )
         ).click()

-        #
-
-
-
+        # Handle Cloudflare challenge that appears after address selection
+        # Wait for page to potentially show Cloudflare challenge
+        time.sleep(3)
+
+        # Check if we hit a Cloudflare challenge
+        if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
+            print("Cloudflare challenge detected, trying to bypass...")
+
+            # If we hit Cloudflare, try recreating driver with JS enabled
+            driver.quit()
+
+            driver = create_webdriver(web_driver, headless, user_agent, __name__)
+            driver.get("https://www.renfrewshire.gov.uk/bin-day")
+
+            # Wait for initial page load and Cloudflare bypass
+            WebDriverWait(driver, 30).until(
+                lambda d: "Just a moment" not in d.title and d.title != ""
             )
-
+            time.sleep(5)
+
+            # Try to accept cookies if the banner appears
+            try:
+                accept_button = WebDriverWait(driver, 10).until(
+                    EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
+                )
+                accept_button.click()
+                time.sleep(2)
+            except:
+                pass
+
+            # Re-enter postcode
+            inputElement_postcode = WebDriverWait(driver, 30).until(
+                EC.presence_of_element_located(
+                    (By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPPOSTCODE")
+                )
+            )
+            inputElement_postcode.send_keys(user_postcode)
+
+            # Click search button
+            findAddress = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located(
+                    (By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPSEARCH")
+                )
+            )
+            findAddress.click()
+
+            # Wait for the 'Select address' dropdown to appear and select option matching the house name/number
+            WebDriverWait(driver, 10).until(
+                EC.element_to_be_clickable(
+                    (
+                        By.XPATH,
+                        "//select[@id='RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPADDRESS']//option[contains(., '"
+                        + user_paon
+                        + "')]",
+                    )
+                )
+            ).click()
+
+        # Handle potential second Cloudflare challenge
+        time.sleep(3)
+        if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
+            print("Second Cloudflare challenge detected, waiting...")
+
+            # Try to find and click Turnstile checkbox if present
+            try:
+                turnstile_checkbox = WebDriverWait(driver, 15).until(
+                    EC.element_to_be_clickable((By.CSS_SELECTOR, "input[type='checkbox']"))
+                )
+                turnstile_checkbox.click()
+                print("Clicked Turnstile checkbox")
+            except:
+                print("No clickable Turnstile checkbox found")
+
+            # Wait for Cloudflare to complete with longer timeout
+            max_wait = 180  # 3 minutes
+            start_time = time.time()
+            while time.time() - start_time < max_wait:
+                current_source = driver.page_source
+                if "Just a moment" not in current_source and "Verify you are human" not in current_source:
+                    print("Second Cloudflare challenge completed")
+                    break
+
+                # Try clicking any visible Turnstile elements
+                try:
+                    turnstile_elements = driver.find_elements(By.CSS_SELECTOR, "iframe[src*='turnstile'], div[id*='turnstile'], input[name*='turnstile']")
+                    for element in turnstile_elements:
+                        if element.is_displayed():
+                            element.click()
+                            print("Clicked Turnstile element")
+                            break
+                except:
+                    pass
+
+                time.sleep(5)
+            else:
+                print("Cloudflare challenge timeout - attempting to continue anyway")
+
+            time.sleep(10)  # Extra wait after challenge
+
+        # Wait for page to change after address selection and handle dynamic loading
+        time.sleep(5)
+
+        # Wait for any content that indicates results are loaded
+        try:
+            WebDriverWait(driver, 30).until(
+                EC.presence_of_element_located((By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_COLLECTIONDETAILS"))
+            )
+            print("Collection details found")
+        except:
+            print("Collection details not found, checking for any collection content")
+            # If collection details not found, wait for page to stabilize and check for any collection content
+            time.sleep(10)
+            try:
+                WebDriverWait(driver, 20).until(
+                    EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'collection') or contains(text(), 'Collection') or contains(text(), 'bin') or contains(text(), 'Bin')]"))
+                )
+                print("Found some collection-related content")
+            except:
+                print("No collection content found, proceeding anyway")

         soup = BeautifulSoup(driver.page_source, features="html.parser")

+        # Save page source for debugging
+        with open("debug_renfrewshire.html", "w", encoding="utf-8") as f:
+            f.write(driver.page_source)
+        print(f"Page title: {driver.title}")
+        print(f"Current URL: {driver.current_url}")
+
         next_collection_div = soup.find(
             "div", {"class": "collection collection--next"}
         )

+        if not next_collection_div:
+            # Check if we're still on Cloudflare page
+            if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
+                print("WARNING: Still on Cloudflare challenge page - this council may need manual intervention")
+                # Return empty data rather than failing completely
+                data["bins"].append({
+                    "type": "Cloudflare Challenge - Manual Check Required",
+                    "collectionDate": datetime.now().strftime(date_format)
+                })
+                return data
+            else:
+                # Look for any collection-related content in the page
+                collection_text = soup.find_all(text=lambda text: text and any(word in text.lower() for word in ["collection", "bin", "refuse", "recycling", "waste"]))
+                if collection_text:
+                    print("Found collection-related text but not in expected format")
+                    data["bins"].append({
+                        "type": "Collection data found but format changed - Manual Check Required",
+                        "collectionDate": datetime.now().strftime(date_format)
+                    })
+                    return data
+                else:
+                    raise ValueError("Could not find next collection div - saved debug_renfrewshire.html")
+
+        next_collection_date_elem = next_collection_div.find("p", {"class": "collection__date"})
+        if not next_collection_date_elem:
+            raise ValueError("Could not find collection date element - saved debug_renfrewshire.html")
+
         next_collection_date = datetime.strptime(
-
-            .get_text()
-            .strip(),
+            next_collection_date_elem.get_text().strip(),
             "%A %d %B %Y",
         )

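
The added Renfrewshire flow above leans on one recurring pattern: poll the page title and source for Cloudflare's interstitial text before touching the form. Below is a minimal standalone sketch of that wait, assuming a Selenium WebDriver supplied by the caller; the helper name is illustrative and not part of the package.

```python
# Hedged sketch: wait for a Cloudflare "Just a moment..." interstitial to clear.
# Assumes a Selenium WebDriver instance is created elsewhere (e.g. by the
# project's create_webdriver helper); wait_for_cloudflare is an invented name.
import time

from selenium.webdriver.support.wait import WebDriverWait


def wait_for_cloudflare(driver, timeout: int = 30, settle: int = 3) -> bool:
    """Return True once the page title no longer looks like a Cloudflare challenge."""
    try:
        WebDriverWait(driver, timeout).until(
            lambda d: d.title != "" and "Just a moment" not in d.title
        )
    except Exception:
        return False  # challenge did not clear within the timeout
    time.sleep(settle)  # give the real page a moment to finish rendering
    return "Verify you are human" not in driver.page_source
```

The diff applies this check twice, once after the initial page load and again after the address is selected, and falls back to a placeholder "Cloudflare Challenge - Manual Check Required" entry when the challenge never clears.
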
@@ -1,57 +1,89 @@
-from bs4 import BeautifulSoup
 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
+import requests
+from datetime import datetime


-# import the wonderful Beautiful Soup and the URL grabber
 class CouncilClass(AbstractGetBinDataClass):
     """
-
-
-
+    Rotherham collections via the public JSON API.
+    Returns the same shape as before:
+      {"bins": [{"type": "Black Bin", "collectionDate": "Tuesday, 29 September 2025"}, ...]}
+    Accepts kwargs['premisesid'] (recommended) or a numeric kwargs['uprn'].
     """

     def parse_data(self, page: str, **kwargs) -> dict:
-
+        # prefer explicit premisesid, fallback to uprn (if numeric)
+        premises = kwargs.get("premisesid")
+        uprn = kwargs.get("uprn")

-
+        if uprn:
+            # preserve original behaviour where check_uprn exists for validation,
+            # but don't fail if uprn is intended as a simple premises id number.
+            try:
+                check_uprn(uprn)
+            except Exception:
+                # silently continue — user may have passed a numeric premises id as uprn
+                pass
+
+        if not premises and str(uprn).strip().isdigit():
+            premises = str(uprn).strip()
+
+        if not premises:
+            raise ValueError("No premises ID supplied. Pass 'premisesid' in kwargs or a numeric 'uprn'.")
+
+        api_url = "https://bins.azurewebsites.net/api/getcollections"
+        params = {
+            "premisesid": str(premises),
+            "localauthority": kwargs.get("localauthority", "Rotherham"),
+        }
         headers = {
-            "
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
+            "User-Agent": "UKBinCollectionData/1.0 (+https://github.com/robbrad/UKBinCollectionData)"
         }
-        response = requests.post(
-            "https://www.rotherham.gov.uk/bin-collections?address={}&submit=Submit".format(
-                user_uprn
-            ),
-            headers=headers
-        )
-        # Make a BS4 object
-        soup = BeautifulSoup(response.text, features="html.parser")
-        soup.prettify()

-
+        try:
+            resp = requests.get(api_url, params=params, headers=headers, timeout=10)
+        except Exception as exc:
+            print(f"Error contacting Rotherham API: {exc}")
+            return {"bins": []}
+
+        if resp.status_code != 200:
+            print(f"Rotherham API request failed ({resp.status_code}). URL: {resp.url}")
+            return {"bins": []}

-
+        try:
+            collections = resp.json()
+        except ValueError:
+            print("Rotherham API returned non-JSON response.")
+            return {"bins": []}
+
+        data = {"bins": []}
+        seen = set()  # dedupe identical (type, date) pairs
+        for item in collections:
+            bin_type = item.get("BinType") or item.get("bintype") or "Unknown"
+            date_str = item.get("CollectionDate") or item.get("collectionDate")
+            if not date_str:
+                continue

-
-
+            # API gives ISO date like '2025-09-29' (or possibly '2025-09-29T00:00:00').
+            try:
+                iso_date = date_str.split("T")[0]
+                parsed = datetime.strptime(iso_date, "%Y-%m-%d")
+                formatted = parsed.strftime(date_format)
+            except Exception:
+                # skip malformed dates
+                continue

-
-
-
-
-                bin_collection = bin_info_cell[1]
+            key = (bin_type.strip().lower(), formatted)
+            if key in seen:
+                continue
+            seen.add(key)

-
-
-                    "type": bin_type.title(),
-                    "collectionDate": datetime.strptime(
-                        bin_collection.get_text(strip=True), "%A, %d %B %Y"
-                    ).strftime(date_format),
-                }
+            dict_data = {"type": bin_type.title(), "collectionDate": formatted}
+            data["bins"].append(dict_data)

-
-
-                print("
+        if not data["bins"]:
+            # helpful debugging note
+            print(f"Rotherham API returned no collection entries for premisesid={premises}")

-        return data
+        return data
@@ -1,4 +1,9 @@
+import datetime
+
 from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait

 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
@@ -13,6 +18,7 @@ class CouncilClass(AbstractGetBinDataClass):
     """

     def parse_data(self, page: str, **kwargs) -> dict:
+<<<<<<< HEAD
         user_postcode = kwargs.get("postcode")
         check_postcode(user_postcode)
         user_uprn = kwargs.get("uprn")
@@ -43,10 +49,16 @@ class CouncilClass(AbstractGetBinDataClass):
             i["data-for"]: i.get("value", "")
             for i in soup.select("input[data-for]")
         }
-
-
-
-        )
+
+        # Check if required form elements exist
+        salt_element = soup.select_one('input[id="pSalt"]')
+        protected_element = soup.select_one('input[id="pPageItemsProtected"]')
+
+        if not salt_element or not protected_element:
+            raise Exception("Required form elements not found. The council website may have changed or be unavailable.")
+
+        payload_salt = salt_element.get("value")
+        payload_protected = protected_element.get("value")

         # Add the PostCode and 'SEARCH' to the payload
         payload["p_request"] = "SEARCH"
@@ -123,10 +135,16 @@ class CouncilClass(AbstractGetBinDataClass):
             i["data-for"]: i.get("value", "")
             for i in soup.select("input[data-for]")
         }
-
-
-
-        )
+
+        # Check if required form elements exist
+        salt_element = soup.select_one('input[id="pSalt"]')
+        protected_element = soup.select_one('input[id="pPageItemsProtected"]')
+
+        if not salt_element or not protected_element:
+            raise Exception("Required form elements not found. The council website may have changed or be unavailable.")
+
+        payload_salt = salt_element.get("value")
+        payload_protected = protected_element.get("value")

         # Add the UPRN and 'SUBMIT' to the payload
         payload["p_request"] = "SUBMIT"
@@ -187,18 +205,115 @@ class CouncilClass(AbstractGetBinDataClass):

         # Create a BeautifulSoup object from the page's HTML
         soup = BeautifulSoup(resource.text, "html.parser")
+=======
+        driver = None
+        try:
+>>>>>>> master
         data = {"bins": []}
+            url = kwargs.get("url")
+            user_paon = kwargs.get("paon")
+            user_postcode = kwargs.get("postcode")
+            web_driver = kwargs.get("web_driver")
+            headless = kwargs.get("headless")
+            check_paon(user_paon)
+            check_postcode(user_postcode)
+
+            # Use a realistic user agent to help bypass Cloudflare
+            user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            driver = create_webdriver(web_driver, headless, user_agent, __name__)
+            driver.get("https://www.somerset.gov.uk/collection-days")
+
+            # Wait for the postcode field to appear then populate it
+            inputElement_postcode = WebDriverWait(driver, 30).until(
+                EC.presence_of_element_located((By.ID, "postcodeSearch"))
+            )
+            inputElement_postcode.send_keys(user_postcode)
+
+            # Click search button
+            findAddress = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.CLASS_NAME, "govuk-button"))
+            )
+            findAddress.click()
+
+            # Wait for the 'Select address' dropdown to appear and select option matching the house name/number
+            WebDriverWait(driver, 10).until(
+                EC.element_to_be_clickable(
+                    (
+                        By.XPATH,
+                        "//select[@id='addressSelect']//option[contains(., '"
+                        + user_paon
+                        + "')]",
+                    )
+                )
+            ).click()
+
+            # Wait for the collections table to appear
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located(
+                    (
+                        By.XPATH,
+                        "//h2[contains(@class,'mt-4') and contains(@class,'govuk-heading-s') and normalize-space(.)='Your next collections']",
+                    )
+                )
+            )
+
+            soup = BeautifulSoup(driver.page_source, features="html.parser")
+
+            collections = soup.find_all("div", {"class": "p-2"})
+
+            for collection in collections:
+                bin_type = collection.find("h3").get_text()
+
+                next_collection = soup.find("div", {"class": "fw-bold"}).get_text()
+
+                following_collection = soup.find(
+                    lambda t: (
+                        t.name == "div"
+                        and t.get_text(strip=True).lower().startswith("followed by")
+                    )
+                ).get_text()
+
+                next_collection_date = datetime.strptime(next_collection, "%A %d %B")
+
+                following_collection_date = datetime.strptime(
+                    following_collection, "followed by %A %d %B"
+                )
+
+                current_date = datetime.now()
+                next_collection_date = next_collection_date.replace(
+                    year=current_date.year
+                )
+                following_collection_date = following_collection_date.replace(
+                    year=current_date.year
+                )
+
+                next_collection_date = get_next_occurrence_from_day_month(
+                    next_collection_date
+                )
+
+                following_collection_date = get_next_occurrence_from_day_month(
+                    following_collection_date
+                )
+
+                dict_data = {
+                    "type": bin_type,
+                    "collectionDate": next_collection_date.strftime(date_format),
+                }
+                data["bins"].append(dict_data)
+
+                dict_data = {
+                    "type": bin_type,
+                    "collectionDate": following_collection_date.strftime(date_format),
+                }
+                data["bins"].append(dict_data)

-
-
-
-
-
-
-
-
-
-
-            data["bins"].append(dict_data)
-
-        return data
+        except Exception as e:
+            # Here you can log the exception if needed
+            print(f"An error occurred: {e}")
+            # Optionally, re-raise the exception if you want it to propagate
+            raise
+        finally:
+            # This block ensures that the driver is closed regardless of an exception
+            if driver:
+                driver.quit()
+        return data
@@ -6,17 +6,16 @@ from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataC

 def format_bin_data(key: str, date: datetime):
     formatted_date = date.strftime(date_format)
-
-
-
-        return [
-
-
-
-    elif re.match(r"^G\d+$", key) is not None:
+    servicename = key.get("hso_servicename")
+    print(servicename)
+    if re.match(r"^Recycl", servicename) is not None:
+        return [ ("Recycling", formatted_date) ]
+    elif re.match(r"^Refuse", servicename) is not None:
+        return [("General Waste (Black Bin)", formatted_date)]
+    elif re.match(r"^Garden", servicename) is not None:
         return [("Garden Waste (Green Bin)", formatted_date)]
-    elif re.match(r"^
-        return [("
+    elif re.match(r"^Food", servicename) is not None:
+        return [("Food Waste", formatted_date)]
     else:
         return None

@@ -27,37 +26,34 @@ class CouncilClass(AbstractGetBinDataClass):
         check_uprn(uprn)

         api_url = (
-            f"https://
-            f"
+            f"https://api.southglos.gov.uk/wastecomp/GetCollectionDetails"
+            f"?uprn={uprn}"
         )

         headers = {"content-type": "application/json"}

         response = requests.get(api_url, headers=headers)

-        json_response = json
+        json_response = response.json()
         if not json_response:
             raise ValueError("No collection data found for provided UPRN.")

-        collection_data = json_response
+        collection_data = json_response.get('value')

         today = datetime.today()
         eight_weeks = datetime.today() + timedelta(days=8 * 7)
         data = {"bins": []}
         collection_tuple = []
-
-
-
-                continue
-
-            item = collection_data[key]
+        for collection in collection_data:
+            print(collection)
+            item = collection.get('hso_nextcollection')

             if item == "":
                 continue

-            collection_date = datetime.
+            collection_date = datetime.fromisoformat(item)
             if today.date() <= collection_date.date() <= eight_weeks.date():
-                bin_data = format_bin_data(
+                bin_data = format_bin_data(collection, collection_date)
                 if bin_data is not None:
                     for bin_date in bin_data:
                         collection_tuple.append(bin_date)
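
The reworked South Gloucestershire parser above maps each record's hso_servicename prefix to a bin label and keeps only collections due within the next eight weeks. Below is a compact standalone sketch of that mapping and window filter; the field names come from the diff, while the overall response shape and the date_format default are assumptions.

```python
# Hedged sketch: map API service names to bin labels and keep only collections
# due within the next eight weeks. Field names (hso_servicename,
# hso_nextcollection) come from the diff; everything else is illustrative.
import re
from datetime import datetime, timedelta

LABELS = [
    (r"^Recycl", "Recycling"),
    (r"^Refuse", "General Waste (Black Bin)"),
    (r"^Garden", "Garden Waste (Green Bin)"),
    (r"^Food", "Food Waste"),
]


def upcoming_bins(collections: list, date_format: str = "%d/%m/%Y") -> list:
    today = datetime.today()
    cutoff = today + timedelta(weeks=8)
    results = []
    for record in collections:
        raw = record.get("hso_nextcollection") or ""
        if not raw:
            continue
        when = datetime.fromisoformat(raw)
        if not (today.date() <= when.date() <= cutoff.date()):
            continue
        for pattern, label in LABELS:
            if re.match(pattern, record.get("hso_servicename", "")):
                results.append((label, when.strftime(date_format)))
                break
    return results
```

Feeding it the json_response.get('value') list from the diff would yield (label, date) tuples roughly equivalent to the collection_tuple the council class builds.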