uk_bin_collection 0.153.0__py3-none-any.whl → 0.157.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. uk_bin_collection/tests/input.json +34 -25
  2. uk_bin_collection/uk_bin_collection/councils/AberdeenCityCouncil.py +0 -1
  3. uk_bin_collection/uk_bin_collection/councils/BCPCouncil.py +45 -120
  4. uk_bin_collection/uk_bin_collection/councils/BasingstokeCouncil.py +4 -1
  5. uk_bin_collection/uk_bin_collection/councils/BrightonandHoveCityCouncil.py +15 -36
  6. uk_bin_collection/uk_bin_collection/councils/CastlepointDistrictCouncil.py +55 -24
  7. uk_bin_collection/uk_bin_collection/councils/DacorumBoroughCouncil.py +22 -13
  8. uk_bin_collection/uk_bin_collection/councils/EastDunbartonshireCouncil.py +52 -0
  9. uk_bin_collection/uk_bin_collection/councils/ErewashBoroughCouncil.py +32 -34
  10. uk_bin_collection/uk_bin_collection/councils/FarehamBoroughCouncil.py +5 -2
  11. uk_bin_collection/uk_bin_collection/councils/FolkstoneandHytheDistrictCouncil.py +22 -0
  12. uk_bin_collection/uk_bin_collection/councils/GlasgowCityCouncil.py +1 -1
  13. uk_bin_collection/uk_bin_collection/councils/HartlepoolBoroughCouncil.py +3 -1
  14. uk_bin_collection/uk_bin_collection/councils/IslingtonCouncil.py +8 -5
  15. uk_bin_collection/uk_bin_collection/councils/LancasterCityCouncil.py +23 -10
  16. uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py +70 -92
  17. uk_bin_collection/uk_bin_collection/councils/NewForestCouncil.py +104 -47
  18. uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py +138 -21
  19. uk_bin_collection/uk_bin_collection/councils/NorthHertfordshireDistrictCouncil.py +26 -128
  20. uk_bin_collection/uk_bin_collection/councils/NorthumberlandCouncil.py +245 -82
  21. uk_bin_collection/uk_bin_collection/councils/OxfordCityCouncil.py +1 -0
  22. uk_bin_collection/uk_bin_collection/councils/RenfrewshireCouncil.py +170 -13
  23. uk_bin_collection/uk_bin_collection/councils/RotherhamCouncil.py +70 -38
  24. uk_bin_collection/uk_bin_collection/councils/RushmoorCouncil.py +4 -2
  25. uk_bin_collection/uk_bin_collection/councils/SandwellBoroughCouncil.py +4 -11
  26. uk_bin_collection/uk_bin_collection/councils/SloughBoroughCouncil.py +39 -21
  27. uk_bin_collection/uk_bin_collection/councils/SomersetCouncil.py +136 -21
  28. uk_bin_collection/uk_bin_collection/councils/SouthGloucestershireCouncil.py +18 -22
  29. uk_bin_collection/uk_bin_collection/councils/TestValleyBoroughCouncil.py +138 -21
  30. uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py +16 -13
  31. {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/METADATA +1 -1
  32. {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/RECORD +35 -34
  33. {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/LICENSE +0 -0
  34. {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/WHEEL +0 -0
  35. {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/entry_points.txt +0 -0
@@ -1,17 +1,17 @@
1
1
  import time
2
+ import datetime
2
3
 
4
+ from datetime import datetime
3
5
  from bs4 import BeautifulSoup
4
6
  from selenium.common.exceptions import TimeoutException
5
7
  from selenium.webdriver.common.by import By
8
+ from selenium.webdriver.common.keys import Keys
6
9
  from selenium.webdriver.support import expected_conditions as EC
7
- from selenium.webdriver.support.ui import WebDriverWait
10
+ from selenium.webdriver.support.ui import Select, WebDriverWait
8
11
 
9
12
  from uk_bin_collection.uk_bin_collection.common import *
10
13
  from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
11
14
 
12
- # import the wonderful Beautiful Soup and the URL grabber
13
-
14
-
15
15
  class CouncilClass(AbstractGetBinDataClass):
16
16
  """
17
17
  Concrete classes have to implement all abstract operations of the
@@ -30,16 +30,23 @@ class CouncilClass(AbstractGetBinDataClass):
30
30
  def parse_data(self, page: str, **kwargs) -> dict:
31
31
  driver = None
32
32
  try:
33
- page = "https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx"
33
+ <<<<<<< HEAD
34
+ # Use the new URL as mentioned in the issue
35
+ page = "http://bincollection.northumberland.gov.uk"
36
+ =======
37
+ page = "https://bincollection.northumberland.gov.uk/postcode"
38
+ >>>>>>> master
34
39
 
35
40
  data = {"bins": []}
36
41
 
37
- user_paon = kwargs.get("paon")
38
42
  user_postcode = kwargs.get("postcode")
43
+ user_uprn = kwargs.get("uprn")
44
+
45
+ check_postcode(user_postcode)
46
+ check_uprn(user_uprn)
47
+
39
48
  web_driver = kwargs.get("web_driver")
40
49
  headless = kwargs.get("headless")
41
- check_paon(user_paon)
42
- check_postcode(user_postcode)
43
50
 
44
51
  # Create Selenium webdriver
45
52
  driver = create_webdriver(web_driver, headless, None, __name__)
@@ -48,114 +55,270 @@ class CouncilClass(AbstractGetBinDataClass):
48
55
  # Create wait object
49
56
  wait = WebDriverWait(driver, 20)
50
57
 
58
+ <<<<<<< HEAD
59
+ # The new site may have different structure, so we'll need to adapt
60
+ # Try to find postcode and house number inputs
61
+ try:
62
+ # Look for postcode input field
63
+ postcode_input = wait.until(
64
+ EC.presence_of_element_located(
65
+ (By.XPATH, "//input[contains(@name, 'postcode') or contains(@id, 'postcode') or contains(@placeholder, 'postcode')]")
66
+ )
67
+ )
68
+
69
+ # Look for house number input field
70
+ house_input = wait.until(
71
+ EC.presence_of_element_located(
72
+ (By.XPATH, "//input[contains(@name, 'house') or contains(@id, 'house') or contains(@name, 'number') or contains(@placeholder, 'house')]")
73
+ )
74
+ )
75
+
76
+ # Enter details
77
+ postcode_input.send_keys(user_postcode)
78
+ house_input.send_keys(user_paon)
79
+
80
+ # Look for submit button
81
+ submit_button = wait.until(
82
+ EC.element_to_be_clickable(
83
+ (By.XPATH, "//button[@type='submit'] | //input[@type='submit'] | //button[contains(text(), 'Search')] | //input[contains(@value, 'Search')]")
84
+ )
85
+ )
86
+ submit_button.click()
87
+
88
+ # Wait for results to load
89
+ time.sleep(3)
90
+
91
+ # Get page source after everything has loaded
92
+ soup = BeautifulSoup(driver.page_source, features="html.parser")
93
+
94
+ # Look for collection dates and bin types in the results
95
+ # This is a generic approach that looks for common patterns
96
+ import re
97
+ from datetime import datetime
98
+
99
+ # Look for date patterns in the page
100
+ date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{2,4}\b'
101
+ page_text = soup.get_text()
102
+ dates = re.findall(date_pattern, page_text, re.IGNORECASE)
103
+
104
+ # Look for bin type keywords near dates
105
+ bin_keywords = ['recycling', 'refuse', 'garden', 'waste', 'rubbish', 'general', 'household']
106
+
107
+ # Try to extract structured data from tables or lists
108
+ tables = soup.find_all('table')
109
+ for table in tables:
110
+ rows = table.find_all('tr')
111
+ for row in rows:
112
+ cells = row.find_all(['td', 'th'])
113
+ if len(cells) >= 2:
114
+ # Look for date in first cell and bin type in second
115
+ date_text = cells[0].get_text().strip()
116
+ type_text = cells[1].get_text().strip()
117
+
118
+ # Try to parse date
119
+ try:
120
+ if re.match(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', date_text):
121
+ date_obj = datetime.strptime(date_text, '%d/%m/%Y')
122
+ elif re.match(r'\d{1,2}\s+\w+\s+\d{4}', date_text):
123
+ date_obj = datetime.strptime(date_text, '%d %B %Y')
124
+ else:
125
+ continue
126
+
127
+ if any(keyword in type_text.lower() for keyword in bin_keywords):
128
+ data["bins"].append({
129
+ "type": type_text,
130
+ "collectionDate": date_obj.strftime(date_format)
131
+ })
132
+ except ValueError:
133
+ continue
134
+
135
+ except TimeoutException:
136
+ # If the new site structure is completely different, fall back to old URL
137
+ driver.get("https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx")
138
+
139
+ # Wait for and click cookie button if present
140
+ try:
141
+ cookie_button = wait.until(
142
+ EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
143
+ )
144
+ cookie_button.click()
145
+ except TimeoutException:
146
+ pass
147
+
148
+ # Continue with original logic for old site
149
+ inputElement_hn = wait.until(
150
+ EC.presence_of_element_located(
151
+ (
152
+ By.ID,
153
+ "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtHouse",
154
+ )
155
+ )
156
+ )
157
+
158
+ inputElement_pc = wait.until(
159
+ EC.presence_of_element_located(
160
+ (
161
+ By.ID,
162
+ "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtPostcode",
163
+ )
164
+ )
165
+ )
166
+
167
+ inputElement_pc.send_keys(user_postcode)
168
+ inputElement_hn.send_keys(user_paon)
169
+
170
+ lookup_button = wait.until(
171
+ EC.element_to_be_clickable(
172
+ (
173
+ By.ID,
174
+ "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_butLookup",
175
+ )
176
+ )
177
+ )
178
+ lookup_button.click()
179
+
180
+ route_summary = wait.until(
181
+ EC.presence_of_element_located(
182
+ (
183
+ By.ID,
184
+ "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
185
+ )
186
+ )
187
+ )
188
+
189
+ soup = BeautifulSoup(driver.page_source, features="html.parser")
190
+
191
+ bins_collected = list(
192
+ map(
193
+ str.strip,
194
+ soup.find(
195
+ "span",
196
+ id="p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
197
+ )
198
+ .text.replace("Routes found: ", "")
199
+ .split(","),
200
+ )
201
+ )
202
+
203
+ bins_by_colours = dict()
204
+ for bin in bins_collected:
205
+ if "(but no dates found)" in bin:
206
+ continue
207
+ style_str = soup.find("span", string=bin)["style"]
208
+ bin_colour = self.extract_styles(style_str)["background-color"].upper()
209
+ bins_by_colours[bin_colour] = bin
210
+
211
+ calander_tables = soup.find_all("table", title="Calendar")
212
+ for table in calander_tables:
213
+ rows = table.find_all("tr")
214
+ month_and_year = (
215
+ rows[0].find("table", class_="calCtrlTitle").find("td").string
216
+ )
217
+ bin_days = table.find_all("td", class_="calCtrlDay")
218
+ for day in bin_days:
219
+ day_styles = self.extract_styles(day["style"])
220
+ if "background-color" in day_styles:
221
+ colour = day_styles["background-color"].upper()
222
+ date = time.strptime(
223
+ f"{day.string} {month_and_year}", "%d %B %Y"
224
+ )
225
+
226
+ data["bins"].append(
227
+ {
228
+ "type": bins_by_colours[colour],
229
+ "collectionDate": time.strftime(date_format, date),
230
+ }
231
+ )
232
+
233
+ =======
51
234
  # Wait for and click cookie button
52
235
  cookie_button = wait.until(
53
- EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
236
+ EC.element_to_be_clickable(
237
+ (By.CLASS_NAME, "accept-all")
238
+ )
54
239
  )
55
240
  cookie_button.click()
56
241
 
57
- # Wait for and find house number input
58
- inputElement_hn = wait.until(
242
+ # Wait for and find postcode input
243
+ inputElement_pc = wait.until(
59
244
  EC.presence_of_element_located(
60
- (
61
- By.ID,
62
- "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtHouse",
63
- )
245
+ (By.ID, "postcode")
64
246
  )
65
247
  )
66
248
 
67
- # Wait for and find postcode input
68
- inputElement_pc = wait.until(
249
+ # Enter postcode and submit
250
+ inputElement_pc.send_keys(user_postcode)
251
+ inputElement_pc.send_keys(Keys.ENTER)
252
+
253
+ # Wait for and find house number input
254
+ selectElement_address = wait.until(
69
255
  EC.presence_of_element_located(
70
- (
71
- By.ID,
72
- "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtPostcode",
73
- )
256
+ (By.ID, "address")
74
257
  )
75
258
  )
76
259
 
77
- # Enter details
78
- inputElement_pc.send_keys(user_postcode)
79
- inputElement_hn.send_keys(user_paon)
260
+ dropdown = Select(selectElement_address)
261
+ dropdown.select_by_value(user_uprn)
80
262
 
81
- # Click lookup button and wait for results
82
- lookup_button = wait.until(
263
+ # Click submit button and wait for results
264
+ submit_button = wait.until(
83
265
  EC.element_to_be_clickable(
84
- (
85
- By.ID,
86
- "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_butLookup",
87
- )
266
+ (By.CLASS_NAME, "govuk-button")
88
267
  )
89
268
  )
90
- lookup_button.click()
269
+ submit_button.click()
91
270
 
92
271
  # Wait for results to load
93
272
  route_summary = wait.until(
94
273
  EC.presence_of_element_located(
95
- (
96
- By.ID,
97
- "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
98
- )
274
+ (By.CLASS_NAME, "govuk-table")
99
275
  )
100
276
  )
101
277
 
278
+ now = datetime.now()
279
+ current_month = now.month
280
+ current_year = now.year
281
+
102
282
  # Get page source after everything has loaded
103
283
  soup = BeautifulSoup(driver.page_source, features="html.parser")
104
284
 
105
- # Work out which bins can be collected for this address. Glass bins are only on some houses due to pilot programme.
106
- bins_collected = list(
107
- map(
108
- str.strip,
109
- soup.find(
110
- "span",
111
- id="p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
112
- )
113
- .text.replace("Routes found: ", "")
114
- .split(","),
115
- )
116
- )
285
+ # From the table, find all rows:
286
+ # - cell 1 is the date in format eg. 9 September (so no year value 🥲)
287
+ # - cell 2 is the day name, not useful
288
+ # - cell 3 is the bin type eg. "General waste", "Recycling", "Garden waste"
289
+ rows = soup.find("tbody", class_="govuk-table__body").find_all("tr", class_="govuk-table__row")
290
+
291
+ for row in rows:
292
+ bin_type=row.find_all("td")[-1].text.strip()
117
293
 
118
- # Get the background colour for each of them...
119
- bins_by_colours = dict()
120
- for bin in bins_collected:
121
- if "(but no dates found)" in bin:
122
- continue
123
- style_str = soup.find("span", string=bin)["style"]
124
- bin_colour = self.extract_styles(style_str)["background-color"].upper()
125
- bins_by_colours[bin_colour] = bin
126
-
127
- # Work through the tables gathering the dates, if the cell has a background colour - match it to the bin type.
128
- calander_tables = soup.find_all("table", title="Calendar")
129
- for table in calander_tables:
130
- # Get month and year
131
- # First row in table is the header
132
- rows = table.find_all("tr")
133
- month_and_year = (
134
- rows[0].find("table", class_="calCtrlTitle").find("td").string
294
+ collection_date_string = row.find('th').text.strip()
295
+
296
+ # sometimes but not always the day is written "22nd" instead of 22 so make sure we get a proper int
297
+ collection_date_day = "".join([i for i in list(collection_date_string.split(" ")[0]) if i.isdigit()])
298
+ collection_date_month_name = collection_date_string.split(" ")[1]
299
+
300
+ # if we are currently in Oct, Nov, or Dec and the collection month is Jan, Feb, or Mar, let's assume its next year
301
+ if (current_month >= 10) and (collection_date_month_name in ["January", "February", "March"]):
302
+ collection_date_year = current_year + 1
303
+ else:
304
+ collection_date_year = current_year
305
+
306
+ collection_date = time.strptime(
307
+ f"{collection_date_day} {collection_date_month_name} {collection_date_year}", "%d %B %Y"
135
308
  )
136
- bin_days = table.find_all("td", class_="calCtrlDay")
137
- for day in bin_days:
138
- day_styles = self.extract_styles(day["style"])
139
- if "background-color" in day_styles:
140
- colour = day_styles["background-color"].upper()
141
- date = time.strptime(
142
- f"{day.string} {month_and_year}", "%d %B %Y"
143
- )
144
309
 
145
- # Add it to the data
146
- data["bins"].append(
147
- {
148
- "type": bins_by_colours[colour],
149
- "collectionDate": time.strftime(date_format, date),
150
- }
151
- )
310
+ # Add it to the data
311
+ data["bins"].append(
312
+ {
313
+ "type": bin_type,
314
+ "collectionDate": time.strftime(date_format, collection_date),
315
+ }
316
+ )
317
+ >>>>>>> master
152
318
  except Exception as e:
153
- # Here you can log the exception if needed
154
319
  print(f"An error occurred: {e}")
155
- # Optionally, re-raise the exception if you want it to propagate
156
320
  raise
157
321
  finally:
158
- # This block ensures that the driver is closed regardless of an exception
159
322
  if driver:
160
323
  driver.quit()
161
324
  return data
@@ -25,6 +25,7 @@ class CouncilClass(AbstractGetBinDataClass):
25
25
  URI = "https://www.oxford.gov.uk/xfp/form/142#q6ad4e3bf432c83230a0347a6eea6c805c672efeb_0"
26
26
 
27
27
  session = requests.Session()
28
+ session.headers.update({'User-Agent': 'HomeAssistant UK Bin Collection integration'})
28
29
  token_response = session.get(session_uri)
29
30
  soup = BeautifulSoup(token_response.text, "html.parser")
30
31
  token = soup.find("input", {"name": "__token"}).attrs["value"]
@@ -1,3 +1,4 @@
1
+ import time
1
2
  from bs4 import BeautifulSoup
2
3
  from selenium.webdriver.common.by import By
3
4
  from selenium.webdriver.support import expected_conditions as EC
@@ -27,14 +28,26 @@ class CouncilClass(AbstractGetBinDataClass):
27
28
  check_paon(user_paon)
28
29
  check_postcode(user_postcode)
29
30
 
30
- # Create Selenium webdriver
31
- driver = create_webdriver(web_driver, headless, None, __name__)
31
+ # Create Selenium webdriver with user agent to bypass Cloudflare
32
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
33
+ driver = create_webdriver(web_driver, headless, user_agent, __name__)
32
34
  driver.get("https://www.renfrewshire.gov.uk/bin-day")
33
35
 
34
- accept_button = WebDriverWait(driver, timeout=30).until(
35
- EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
36
+ # Wait for initial page load and Cloudflare bypass
37
+ WebDriverWait(driver, 30).until(
38
+ lambda d: "Just a moment" not in d.title and d.title != ""
36
39
  )
37
- accept_button.click()
40
+ time.sleep(3)
41
+
42
+ # Try to accept cookies if the banner appears
43
+ try:
44
+ accept_button = WebDriverWait(driver, 10).until(
45
+ EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
46
+ )
47
+ accept_button.click()
48
+ time.sleep(2)
49
+ except:
50
+ pass
38
51
 
39
52
  # Wait for the postcode field to appear then populate it
40
53
  inputElement_postcode = WebDriverWait(driver, 30).until(
@@ -64,23 +77,167 @@ class CouncilClass(AbstractGetBinDataClass):
64
77
  )
65
78
  ).click()
66
79
 
67
- # Wait for the collections table to appear
68
- WebDriverWait(driver, 10).until(
69
- EC.presence_of_element_located(
70
- (By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_COLLECTIONDETAILS")
80
+ # Handle Cloudflare challenge that appears after address selection
81
+ # Wait for page to potentially show Cloudflare challenge
82
+ time.sleep(3)
83
+
84
+ # Check if we hit a Cloudflare challenge
85
+ if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
86
+ print("Cloudflare challenge detected, trying to bypass...")
87
+
88
+ # If we hit Cloudflare, try recreating driver with JS enabled
89
+ driver.quit()
90
+
91
+ driver = create_webdriver(web_driver, headless, user_agent, __name__)
92
+ driver.get("https://www.renfrewshire.gov.uk/bin-day")
93
+
94
+ # Wait for initial page load and Cloudflare bypass
95
+ WebDriverWait(driver, 30).until(
96
+ lambda d: "Just a moment" not in d.title and d.title != ""
71
97
  )
72
- )
98
+ time.sleep(5)
99
+
100
+ # Try to accept cookies if the banner appears
101
+ try:
102
+ accept_button = WebDriverWait(driver, 10).until(
103
+ EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
104
+ )
105
+ accept_button.click()
106
+ time.sleep(2)
107
+ except:
108
+ pass
109
+
110
+ # Re-enter postcode
111
+ inputElement_postcode = WebDriverWait(driver, 30).until(
112
+ EC.presence_of_element_located(
113
+ (By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPPOSTCODE")
114
+ )
115
+ )
116
+ inputElement_postcode.send_keys(user_postcode)
117
+
118
+ # Click search button
119
+ findAddress = WebDriverWait(driver, 10).until(
120
+ EC.presence_of_element_located(
121
+ (By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPSEARCH")
122
+ )
123
+ )
124
+ findAddress.click()
125
+
126
+ # Wait for the 'Select address' dropdown to appear and select option matching the house name/number
127
+ WebDriverWait(driver, 10).until(
128
+ EC.element_to_be_clickable(
129
+ (
130
+ By.XPATH,
131
+ "//select[@id='RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPADDRESS']//option[contains(., '"
132
+ + user_paon
133
+ + "')]",
134
+ )
135
+ )
136
+ ).click()
137
+
138
+ # Handle potential second Cloudflare challenge
139
+ time.sleep(3)
140
+ if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
141
+ print("Second Cloudflare challenge detected, waiting...")
142
+
143
+ # Try to find and click Turnstile checkbox if present
144
+ try:
145
+ turnstile_checkbox = WebDriverWait(driver, 15).until(
146
+ EC.element_to_be_clickable((By.CSS_SELECTOR, "input[type='checkbox']"))
147
+ )
148
+ turnstile_checkbox.click()
149
+ print("Clicked Turnstile checkbox")
150
+ except:
151
+ print("No clickable Turnstile checkbox found")
152
+
153
+ # Wait for Cloudflare to complete with longer timeout
154
+ max_wait = 180 # 3 minutes
155
+ start_time = time.time()
156
+ while time.time() - start_time < max_wait:
157
+ current_source = driver.page_source
158
+ if "Just a moment" not in current_source and "Verify you are human" not in current_source:
159
+ print("Second Cloudflare challenge completed")
160
+ break
161
+
162
+ # Try clicking any visible Turnstile elements
163
+ try:
164
+ turnstile_elements = driver.find_elements(By.CSS_SELECTOR, "iframe[src*='turnstile'], div[id*='turnstile'], input[name*='turnstile']")
165
+ for element in turnstile_elements:
166
+ if element.is_displayed():
167
+ element.click()
168
+ print("Clicked Turnstile element")
169
+ break
170
+ except:
171
+ pass
172
+
173
+ time.sleep(5)
174
+ else:
175
+ print("Cloudflare challenge timeout - attempting to continue anyway")
176
+
177
+ time.sleep(10) # Extra wait after challenge
178
+
179
+ # Wait for page to change after address selection and handle dynamic loading
180
+ time.sleep(5)
181
+
182
+ # Wait for any content that indicates results are loaded
183
+ try:
184
+ WebDriverWait(driver, 30).until(
185
+ EC.presence_of_element_located((By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_COLLECTIONDETAILS"))
186
+ )
187
+ print("Collection details found")
188
+ except:
189
+ print("Collection details not found, checking for any collection content")
190
+ # If collection details not found, wait for page to stabilize and check for any collection content
191
+ time.sleep(10)
192
+ try:
193
+ WebDriverWait(driver, 20).until(
194
+ EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'collection') or contains(text(), 'Collection') or contains(text(), 'bin') or contains(text(), 'Bin')]"))
195
+ )
196
+ print("Found some collection-related content")
197
+ except:
198
+ print("No collection content found, proceeding anyway")
73
199
 
74
200
  soup = BeautifulSoup(driver.page_source, features="html.parser")
75
201
 
202
+ # Save page source for debugging
203
+ with open("debug_renfrewshire.html", "w", encoding="utf-8") as f:
204
+ f.write(driver.page_source)
205
+ print(f"Page title: {driver.title}")
206
+ print(f"Current URL: {driver.current_url}")
207
+
76
208
  next_collection_div = soup.find(
77
209
  "div", {"class": "collection collection--next"}
78
210
  )
79
211
 
212
+ if not next_collection_div:
213
+ # Check if we're still on Cloudflare page
214
+ if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
215
+ print("WARNING: Still on Cloudflare challenge page - this council may need manual intervention")
216
+ # Return empty data rather than failing completely
217
+ data["bins"].append({
218
+ "type": "Cloudflare Challenge - Manual Check Required",
219
+ "collectionDate": datetime.now().strftime(date_format)
220
+ })
221
+ return data
222
+ else:
223
+ # Look for any collection-related content in the page
224
+ collection_text = soup.find_all(text=lambda text: text and any(word in text.lower() for word in ["collection", "bin", "refuse", "recycling", "waste"]))
225
+ if collection_text:
226
+ print("Found collection-related text but not in expected format")
227
+ data["bins"].append({
228
+ "type": "Collection data found but format changed - Manual Check Required",
229
+ "collectionDate": datetime.now().strftime(date_format)
230
+ })
231
+ return data
232
+ else:
233
+ raise ValueError("Could not find next collection div - saved debug_renfrewshire.html")
234
+
235
+ next_collection_date_elem = next_collection_div.find("p", {"class": "collection__date"})
236
+ if not next_collection_date_elem:
237
+ raise ValueError("Could not find collection date element - saved debug_renfrewshire.html")
238
+
80
239
  next_collection_date = datetime.strptime(
81
- next_collection_div.find("p", {"class": "collection__date"})
82
- .get_text()
83
- .strip(),
240
+ next_collection_date_elem.get_text().strip(),
84
241
  "%A %d %B %Y",
85
242
  )
86
243