edb-noumea 0.3.6__tar.gz → 0.3.7__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: edb-noumea
3
- Version: 0.3.6
3
+ Version: 0.3.7
4
4
  Summary: Scraper pour la qualité des eaux de baignade à Nouméa.
5
5
  Project-URL: Homepage, https://github.com/adriens/edb-noumea
6
6
  Project-URL: Repository, https://github.com/adriens/edb-noumea
@@ -107,6 +107,32 @@ def get_detailed_results():
107
107
  # It seems to start at index 6 in the camelot df.
108
108
  df = df.iloc[6:].reset_index(drop=True)
109
109
 
110
+ # Apply ffill to all relevant columns after initial column assignment and before specific cleaning
111
+ # Replace empty strings with pd.NA first to ensure ffill works
112
+ for col in df.columns:
113
+ df[col] = df[col].replace('', pd.NA)
114
+
115
+ # Now, forward-fill all columns. This assumes that empty cells in a data block should inherit from the previous non-empty cell.
116
+ df = df.ffill()
117
+
118
+ # Remove trailing informational rows
119
+ # Identify the patterns of the rows to be removed
120
+ unwanted_patterns = [
121
+ "La qualité des eaux de baignade est évaluée au",
122
+ "de présence de germes pathogènes",
123
+ "Suivant l'arrêté N°", # Added to catch more informational rows
124
+ "Indicateurs microbiologiques" # Added to catch more informational rows
125
+ ]
126
+
127
+ # Filter out rows that contain these patterns in the 'Nom du site de baignade' or 'Point de prélèvement' column
128
+ # We check both columns as the content might shift
129
+ initial_row_count = df.shape[0]
130
+ df = df[~df['Nom du site de baignade'].fillna('').astype(str).str.contains('|'.join(unwanted_patterns), case=False, na=False)]
131
+ df = df[~df['Point de prélèvement'].fillna('').astype(str).str.contains('|'.join(unwanted_patterns), case=False, na=False)]
132
+
133
+ if df.shape[0] < initial_row_count:
134
+ print(f"Removed {initial_row_count - df.shape[0]} informational rows.")
135
+
110
136
  except Exception as e:
111
137
  print(f"❌ Une erreur est survenue lors de l'extraction des données du PDF avec Camelot.")
112
138
  print(f" Erreur originale : {e}")
@@ -171,11 +197,11 @@ def get_detailed_results():
171
197
 
172
198
  # Nettoyer et convertir les colonnes e_coli_npp_100ml et enterocoques_npp_100ml
173
199
  if "e_coli_npp_100ml" in cleaned_df.columns:
174
- cleaned_df["e_coli_npp_100ml"] = cleaned_df["e_coli_npp_100ml"].astype(str).str.replace(r"<\s*10", "10", regex=True)
200
+ cleaned_df["e_coli_npp_100ml"] = cleaned_df["e_coli_npp_100ml"].astype(str).str.replace(" ", "", regex=False).str.replace(r"<\s*10", "10", regex=True)
175
201
  cleaned_df["e_coli_npp_100ml"] = pd.to_numeric(cleaned_df["e_coli_npp_100ml"], errors="coerce").astype('Int64')
176
202
 
177
203
  if "enterocoques_npp_100ml" in cleaned_df.columns:
178
- cleaned_df["enterocoques_npp_100ml"] = cleaned_df["enterocoques_npp_100ml"].astype(str).str.replace(r"<\s*10", "10", regex=True)
204
+ cleaned_df["enterocoques_npp_100ml"] = cleaned_df["enterocoques_npp_100ml"].astype(str).str.replace(" ", "", regex=False).str.replace(r"<\s*10", "10", regex=True)
179
205
  cleaned_df["enterocoques_npp_100ml"] = pd.to_numeric(cleaned_df["enterocoques_npp_100ml"], errors="coerce").astype('Int64')
180
206
 
181
207
  # Convertir la colonne 'date' en datetime (format jour/mois/année)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: edb-noumea
3
- Version: 0.3.6
3
+ Version: 0.3.7
4
4
  Summary: Scraper pour la qualité des eaux de baignade à Nouméa.
5
5
  Project-URL: Homepage, https://github.com/adriens/edb-noumea
6
6
  Project-URL: Repository, https://github.com/adriens/edb-noumea
@@ -2,7 +2,7 @@
2
2
  name = "edb-noumea"
3
3
  description = "Scraper pour la qualité des eaux de baignade à Nouméa."
4
4
  readme = "README.md"
5
- version = "0.3.6"
5
+ version = "0.3.7"
6
6
  dependencies = [
7
7
  "requests",
8
8
  "beautifulsoup4",
File without changes
File without changes
File without changes