giga-spatial 0.6.9-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/METADATA +30 -4
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/RECORD +22 -20
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +1 -0
- gigaspatial/core/io/adls_data_store.py +104 -11
- gigaspatial/core/io/local_data_store.py +8 -0
- gigaspatial/generators/poi.py +226 -82
- gigaspatial/generators/zonal/base.py +41 -28
- gigaspatial/generators/zonal/geometry.py +91 -41
- gigaspatial/grid/h3.py +417 -0
- gigaspatial/grid/mercator_tiles.py +1 -1
- gigaspatial/handlers/base.py +22 -8
- gigaspatial/handlers/ghsl.py +22 -8
- gigaspatial/handlers/giga.py +9 -4
- gigaspatial/handlers/healthsites.py +350 -0
- gigaspatial/handlers/osm.py +325 -105
- gigaspatial/handlers/worldpop.py +228 -9
- gigaspatial/processing/geo.py +11 -6
- gigaspatial/processing/tif_processor.py +1183 -496
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/osm.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor
 from requests.exceptions import RequestException
 from shapely.geometry import Polygon, Point
 import pycountry
+from datetime import datetime
 
 from gigaspatial.config import config
 
```
```diff
@@ -112,56 +113,100 @@ class OSMLocationFetcher:
             names.append(name)
         return sorted(set(names))
 
+    @staticmethod
+    def get_osm_countries(
+        iso3_code: Optional[str] = None, include_names: bool = True, timeout: int = 1000
+    ) -> Union[str, Dict[str, str], List[str], List[Dict[str, str]]]:
         """
-            date_filter = f'(newer:"{since_year}-01-01T00:00:00Z")'
-        else:
-            date_filter = ""
+        Fetch countries from OpenStreetMap database.
 
-        for category, types in self.location_types.items():
-            nodes_relations_queries.extend(
-                [
-                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
-                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
-                ]
-            )
+        This queries the actual OSM database for country boundaries and returns
+        country names as they appear in OSM, including various name translations.
 
+        Args:
+            iso3_code (str, optional): ISO 3166-1 alpha-3 code to fetch a specific country.
+                If provided, returns single country data.
+                If None, returns all countries.
+            include_names (bool): If True, return dict with multiple name variants.
+                If False, return only the primary name.
+            timeout (int): Timeout for the Overpass API request (default: 1000).
 
+        Returns:
+            When iso3_code is provided:
+                - If include_names=False: Single country name (str)
+                - If include_names=True: Dict with name variants
+            When iso3_code is None:
+                - If include_names=False: List of country names
+                - If include_names=True: List of dicts with name variants including:
+                    name, name:en, ISO3166-1 codes, and other name translations
+
+        Raises:
+            ValueError: If iso3_code is provided but country not found in OSM.
+        """
+        if iso3_code:
+            # Filter for the specific ISO3 code provided
+            iso3_upper = iso3_code.upper()
+            country_filter = f'["ISO3166-1:alpha3"="{iso3_upper}"]'
+        else:
+            # Filter for the *existence* of an ISO3 code tag to limit results to actual countries
+            country_filter = '["ISO3166-1:alpha3"]'
+
+        # Query OSM for country-level boundaries
+        query = f"""
+        [out:json][timeout:{timeout}];
         (
+            relation["boundary"="administrative"]["admin_level"="2"]{country_filter};
         );
-        out
+        out tags;
         """
 
-                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
-            )
+        url = "http://overpass-api.de/api/interpreter"
+        response = requests.get(url, params={"data": query}, timeout=timeout)
+        response.raise_for_status()
+        data = response.json()
 
+        countries = []
+        for element in data.get("elements", []):
+            tags = element.get("tags", {})
 
+            if include_names:
+                country_info = {
+                    "name": tags.get("name", ""),
+                    "name:en": tags.get("name:en", ""),
+                    "official_name": tags.get("official_name", ""),
+                    "official_name:en": tags.get("official_name:en", ""),
+                    "ISO3166-1": tags.get("ISO3166-1", ""),
+                    "ISO3166-1:alpha2": tags.get("ISO3166-1:alpha2", ""),
+                    "ISO3166-1:alpha3": tags.get("ISO3166-1:alpha3", ""),
+                }
+
+                # Add any other name:* tags (translations)
+                for key, value in tags.items():
+                    if key.startswith("name:") and key not in country_info:
+                        country_info[key] = value
+
+                # Remove empty string values
+                country_info = {k: v for k, v in country_info.items() if v}
+
+                if country_info.get("name"):  # Only add if has a name
+                    countries.append(country_info)
+            else:
+                name = tags.get("name")
+                if name:
+                    countries.append(name)
+
+        # If looking for a specific country, return single result or raise error
+        if iso3_code:
+            if not countries:
+                raise ValueError(
+                    f"Country with ISO3 code '{iso3_code}' not found in OSM database"
+                )
+            return countries[0]  # Return single country, not a list
 
+        # Return sorted list for all countries
+        return sorted(
+            countries, key=lambda x: x if isinstance(x, str) else x.get("name", "")
+        )
 
     def _make_request(self, query: str) -> Dict:
         """Make HTTP request to Overpass API with retry mechanism."""
```
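For orientation, here is a minimal usage sketch of the new `get_osm_countries` static method added above. The ISO3 code is illustrative; the return shapes follow the docstring in this diff:

```python
from gigaspatial.handlers.osm import OSMLocationFetcher

# All countries as a sorted list of plain names
names = OSMLocationFetcher.get_osm_countries(include_names=False)

# One country as a dict of name variants (include_names defaults to True);
# raises ValueError if the ISO3 code is not found in OSM
sen = OSMLocationFetcher.get_osm_countries(iso3_code="SEN")
print(sen.get("name:en"), sen.get("ISO3166-1:alpha2"))
```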
```diff
@@ -213,23 +258,33 @@ class OSMLocationFetcher:
             _lon = element.get("lon") or element["center"]["lon"]
             point_geom = Point(_lon, _lat)
 
-            #
+            # Extract metadata if available
+            metadata = {}
+            if "timestamp" in element:
+                metadata["timestamp"] = element["timestamp"]
+                metadata["version"] = element.get("version")
+                metadata["changeset"] = element.get("changeset")
+                metadata["user"] = element.get("user")
+                metadata["uid"] = element.get("uid")
+
+            # For each matching category, create a separate element
             results = []
             for category, value in matching_categories.items():
-                )
+                result = {
+                    "source_id": element["id"],
+                    "category": category,
+                    "category_value": value,
+                    "name": tags.get("name", ""),
+                    "name_en": tags.get("name:en", ""),
+                    "type": element["type"],
+                    "geometry": point_geom,
+                    "latitude": _lat,
+                    "longitude": _lon,
+                    "matching_categories": list(matching_categories.keys()),
+                }
+                # Add metadata if available
+                result.update(metadata)
+                results.append(result)
 
             return results
```
```diff
@@ -256,36 +311,121 @@ class OSMLocationFetcher:
             polygon = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
             centroid = polygon.centroid
 
+            # Extract metadata if available
+            metadata = {}
+            if "timestamp" in element:
+                metadata["timestamp"] = element["timestamp"]
+                metadata["version"] = element.get("version")
+                metadata["changeset"] = element.get("changeset")
+                metadata["user"] = element.get("user")
+                metadata["uid"] = element.get("uid")
+
             # For each matching category, create a separate element
             results = []
             for category, value in matching_categories.items():
-                )
+                result = {
+                    "source_id": element["id"],
+                    "category": category,
+                    "category_value": value,
+                    "name": tags.get("name", ""),
+                    "name_en": tags.get("name:en", ""),
+                    "type": element["type"],
+                    "geometry": polygon,
+                    "latitude": centroid.y,
+                    "longitude": centroid.x,
+                    "matching_categories": list(matching_categories.keys()),
+                }
+                # Add metadata if available
+                result.update(metadata)
+                results.append(result)
 
             return results
         except (KeyError, ValueError) as e:
             self.logger.error(f"Error processing way geometry: {str(e)}")
             return []
 
+    def _build_queries(
+        self,
+        date_filter_type: Optional[Literal["newer", "changed"]] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        include_metadata: bool = False,
+    ) -> List[str]:
+        """
+        Construct Overpass QL queries with optional date filtering and metadata.
+
+        Args:
+            date_filter_type: Type of date filter ('newer' or 'changed')
+            start_date: Start date in ISO 8601 format
+            end_date: End date in ISO 8601 format (required for 'changed')
+            include_metadata: If True, include change metadata (timestamp, version, changeset, user)
+
+        Returns:
+            List[str]: List of [nodes_relations_query, ways_query]
+        """
+        # Build the date filter based on type
+        if date_filter_type == "newer" and start_date:
+            date_filter = f'(newer:"{start_date}")'
+        elif date_filter_type == "changed" and start_date and end_date:
+            date_filter = f'(changed:"{start_date}","{end_date}")'
+        else:
+            date_filter = ""
+
+        # Determine output mode
+        output_mode = "center meta" if include_metadata else "center"
+        output_mode_geom = "geom meta" if include_metadata else "geom"
+
+        # Query for nodes and relations
+        nodes_relations_queries = []
+        for category, types in self.location_types.items():
+            nodes_relations_queries.extend(
+                [
+                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                ]
+            )
+
+        nodes_relations_queries = "\n".join(nodes_relations_queries)
+
+        nodes_relations_query = f"""
+        [out:json][timeout:{self.timeout}];
+        {self.area_query}
+        (
+        {nodes_relations_queries}
+        );
+        out {output_mode};
+        """
+
+        # Query for ways
+        ways_queries = []
+        for category, types in self.location_types.items():
+            ways_queries.append(
+                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
+            )
+
+        ways_queries = "\n".join(ways_queries)
+
+        ways_query = f"""
+        [out:json][timeout:{self.timeout}];
+        {self.area_query}
+        (
+        {ways_queries}
+        );
+        out {output_mode_geom};
+        """
+
+        return [nodes_relations_query, ways_query]
+
     def fetch_locations(
         self,
+        since_date: Optional[Union[str, datetime]] = None,
         handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+        include_metadata: bool = False,
     ) -> pd.DataFrame:
         """
-        Fetch
+        Fetch OSM locations, optionally filtered by 'since' date.
+
+        Use this for incremental updates or getting all current locations.
 
         Args:
             since_year (int, optional): Filter for locations added/modified since this year.
```
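To make the new `_build_queries` output concrete, here is roughly what the nodes/relations query would look like for a hypothetical fetcher configured with `location_types={"amenity": ["school", "college"]}`, a "newer" date filter, and `include_metadata=True`. The `area[...]` line and the timeout value stand in for `self.area_query` and `self.timeout`, which this diff does not show:

```
[out:json][timeout:600];
area["ISO3166-1:alpha3"="SEN"]->.searchArea;
(
node["amenity"~"^(school|college)"](newer:"2024-01-01T00:00:00Z")(area.searchArea);
relation["amenity"~"^(school|college)"](newer:"2024-01-01T00:00:00Z")(area.searchArea);
);
out center meta;
```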
```diff
@@ -293,6 +433,8 @@ class OSMLocationFetcher:
                 - 'separate': Create separate entries for each category (default)
                 - 'combine': Use a single entry with a list of matching categories
                 - 'primary': Keep only the first matching category
+            include_metadata: If True, include change tracking metadata
+                (timestamp, version, changeset, user, uid)
 
         Returns:
             pd.DataFrame: Processed OSM locations
```
```diff
@@ -306,10 +448,118 @@ class OSMLocationFetcher:
             f"Fetching OSM locations from Overpass API for country: {self.country}"
         )
         self.logger.info(f"Location types: {self.location_types}")
-        self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")
 
-        #
+        # Normalize date if provided
+        since_str = self._normalize_date(since_date) if since_date else None
+
+        if since_str:
+            self.logger.info(f"Filtering for changes since: {since_str}")
+
+        queries = self._build_queries(
+            date_filter_type="newer" if since_str else None,
+            start_date=since_str,
+            include_metadata=include_metadata,
+        )
+
+        return self._execute_and_process_queries(queries, handle_duplicates)
+
+    def fetch_locations_changed_between(
+        self,
+        start_date: Union[str, datetime],
+        end_date: Union[str, datetime],
+        handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+        include_metadata: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Fetch OSM locations that changed within a specific date range.
+
+        Use this for historical analysis or tracking changes in a specific period.
+
+        Args:
+            start_date: Start date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ")
+                or datetime object. Changes after this date will be included.
+            end_date: End date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ")
+                or datetime object. Changes before this date will be included.
+            handle_duplicates: How to handle objects matching multiple categories:
+                - 'separate': Create separate entries for each category (default)
+                - 'combine': Use a single entry with a list of matching categories
+                - 'primary': Keep only the first matching category
+            include_metadata: If True, include change tracking metadata
+                (timestamp, version, changeset, user, uid)
+                Defaults to True since change tracking is the main use case.
+
+        Returns:
+            pd.DataFrame: Processed OSM locations that changed within the date range
+
+        Raises:
+            ValueError: If dates are invalid or start_date is after end_date
+        """
+        start_str = self._normalize_date(start_date)
+        end_str = self._normalize_date(end_date)
+
+        if start_str >= end_str:
+            raise ValueError(
+                f"start_date must be before end_date (got {start_str} >= {end_str})"
+            )
+
+        queries = self._build_queries(
+            date_filter_type="changed",
+            start_date=start_str,
+            end_date=end_str,
+            include_metadata=include_metadata,
+        )
+
+        return self._execute_and_process_queries(queries, handle_duplicates)
+
+    def _normalize_date(self, date_input: Union[str, datetime]) -> str:
+        """
+        Convert date input to ISO 8601 format string.
+
+        Args:
+            date_input: Either a string in ISO 8601 format or a datetime object
+
+        Returns:
+            str: Date in format "YYYY-MM-DDThh:mm:ssZ"
+
+        Raises:
+            ValueError: If string format is invalid
+        """
+        from datetime import datetime
+
+        if isinstance(date_input, datetime):
+            # Convert datetime to ISO 8601 string with Z (UTC) timezone
+            return date_input.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        elif isinstance(date_input, str):
+            # Validate the string format
+            try:
+                # Try to parse it to ensure it's valid
+                datetime.strptime(date_input, "%Y-%m-%dT%H:%M:%SZ")
+                return date_input
+            except ValueError:
+                raise ValueError(
+                    f"Invalid date format: '{date_input}'. "
+                    "Expected format: 'YYYY-MM-DDThh:mm:ssZ' (e.g., '2024-03-15T14:30:00Z')"
+                )
+        else:
+            raise TypeError(
+                f"date_input must be str or datetime, got {type(date_input).__name__}"
+            )
+
+    def _execute_and_process_queries(
+        self, queries: List[str], handle_duplicates: str
+    ) -> pd.DataFrame:
+        """
+        Execute queries and process results (extracted from fetch_locations).
+
+        Args:
+            queries: List of [nodes_relations_query, ways_query]
+            handle_duplicates: Strategy for handling duplicate categories
+
+        Returns:
+            pd.DataFrame: Processed locations
+        """
+        nodes_relations_query, ways_query = queries
 
         # Fetch nodes and relations
         nodes_relations_response = self._make_request(nodes_relations_query)
```
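A hedged end-to-end sketch of the two fetch paths wired up above. The method signatures come from this diff; the constructor arguments are assumptions, since the diff only shows that the fetcher carries `country`, `location_types`, `timeout`, and `area_query` attributes:

```python
from datetime import datetime
from gigaspatial.handlers.osm import OSMLocationFetcher

# Constructor arguments are assumed here for illustration
fetcher = OSMLocationFetcher(country="SEN", location_types={"amenity": ["school"]})

# Incremental update: everything added/modified since a date (str or datetime)
recent = fetcher.fetch_locations(
    since_date="2024-01-01T00:00:00Z", include_metadata=True
)

# Historical window: objects changed between two instants (metadata on by default)
changed = fetcher.fetch_locations_changed_between(
    start_date=datetime(2024, 1, 1), end_date=datetime(2024, 6, 30)
)
```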
```diff
@@ -352,16 +602,14 @@ class OSMLocationFetcher:
             self.logger.warning("No matching elements found after processing")
             return pd.DataFrame()
 
-        # Handle duplicates
+        # Handle duplicates (reuse existing logic from fetch_locations)
         if handle_duplicates != "separate":
-            # Group by source_id
             grouped_elements = {}
             for elem in all_elements:
                 source_id = elem["source_id"]
                 if source_id not in grouped_elements:
                     grouped_elements[source_id] = elem
                 elif handle_duplicates == "combine":
-                    # Combine matching categories
                     if grouped_elements[source_id]["category"] != elem["category"]:
                         if isinstance(grouped_elements[source_id]["category"], str):
                             grouped_elements[source_id]["category"] = [
```
```diff
@@ -381,44 +629,16 @@ class OSMLocationFetcher:
                             grouped_elements[source_id]["category_value"].append(
                                 elem["category_value"]
                             )
-            # For 'primary', just keep the first one we encountered
 
             all_elements = list(grouped_elements.values())
 
         locations = pd.DataFrame(all_elements)
 
-        # Log
+        # Log statistics
         type_counts = locations["type"].value_counts()
         self.logger.info("\nElement type distribution:")
         for element_type, count in type_counts.items():
             self.logger.info(f"{element_type}: {count}")
 
-        # Log category distribution
-        if handle_duplicates == "combine":
-            # Count each category separately when they're in lists
-            category_counts = {}
-            for cats in locations["category"]:
-                if isinstance(cats, list):
-                    for cat in cats:
-                        category_counts[cat] = category_counts.get(cat, 0) + 1
-                else:
-                    category_counts[cats] = category_counts.get(cats, 0) + 1
-
-            self.logger.info("\nCategory distribution:")
-            for category, count in category_counts.items():
-                self.logger.info(f"{category}: {count}")
-        else:
-            category_counts = locations["category"].value_counts()
-            self.logger.info("\nCategory distribution:")
-            for category, count in category_counts.items():
-                self.logger.info(f"{category}: {count}")
-
-        # Log elements with multiple matching categories
-        multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
-        if multi_category:
-            self.logger.info(
-                f"\n{len(multi_category)} elements matched multiple categories"
-            )
-
         self.logger.info(f"Successfully processed {len(locations)} locations")
         return locations
```
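Finally, continuing from the fetcher sketched above, a short sketch of how the retained `handle_duplicates` strategies shape the resulting DataFrame. Column names are taken from the element dicts built in this diff; the filtering line is illustrative:

```python
# 'separate' (default): one row per matching category
# 'combine': one row; `category`/`category_value` hold lists when an element
#            matched several configured categories
# 'primary': one row; the first matching category wins
df = fetcher.fetch_locations(handle_duplicates="combine")

# `matching_categories` always lists every match, so multi-category
# elements can be isolated regardless of strategy:
multi = df[df["matching_categories"].apply(len) > 1]
```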