giga-spatial 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor
  from requests.exceptions import RequestException
  from shapely.geometry import Polygon, Point
  import pycountry
+ from datetime import datetime

  from gigaspatial.config import config

@@ -112,56 +113,100 @@ class OSMLocationFetcher:
              names.append(name)
          return sorted(set(names))

-     def _build_queries(self, since_year: Optional[int] = None) -> List[str]:
-         """
-         Construct separate Overpass QL queries for different element types and categories.
-         Returns list of [nodes_relations_query, ways_query]
+     @staticmethod
+     def get_osm_countries(
+         iso3_code: Optional[str] = None, include_names: bool = True, timeout: int = 1000
+     ) -> Union[str, Dict[str, str], List[str], List[Dict[str, str]]]:
          """
-         if since_year:
-             date_filter = f'(newer:"{since_year}-01-01T00:00:00Z")'
-         else:
-             date_filter = ""
+         Fetch countries from OpenStreetMap database.

-         # Query for nodes and relations (with center output)
-         nodes_relations_queries = []
-         for category, types in self.location_types.items():
-             nodes_relations_queries.extend(
-                 [
-                     f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
-                     f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
-                 ]
-             )
+         This queries the actual OSM database for country boundaries and returns
+         country names as they appear in OSM, including various name translations.

-         nodes_relations_queries = "\n".join(nodes_relations_queries)
+         Args:
+             iso3_code (str, optional): ISO 3166-1 alpha-3 code to fetch a specific country.
+                 If provided, returns single country data.
+                 If None, returns all countries.
+             include_names (bool): If True, return dict with multiple name variants.
+                 If False, return only the primary name.
+             timeout (int): Timeout for the Overpass API request (default: 1000).

-         nodes_relations_query = f"""
-         [out:json][timeout:{self.timeout}];
-         {self.area_query}
+         Returns:
+             When iso3_code is provided:
+                 - If include_names=False: Single country name (str)
+                 - If include_names=True: Dict with name variants
+             When iso3_code is None:
+                 - If include_names=False: List of country names
+                 - If include_names=True: List of dicts with name variants including:
+                   name, name:en, ISO3166-1 codes, and other name translations
+
+         Raises:
+             ValueError: If iso3_code is provided but country not found in OSM.
+         """
+         if iso3_code:
+             # Filter for the specific ISO3 code provided
+             iso3_upper = iso3_code.upper()
+             country_filter = f'["ISO3166-1:alpha3"="{iso3_upper}"]'
+         else:
+             # Filter for the *existence* of an ISO3 code tag to limit results to actual countries
+             country_filter = '["ISO3166-1:alpha3"]'
+
+         # Query OSM for country-level boundaries
+         query = f"""
+         [out:json][timeout:{timeout}];
          (
-         {nodes_relations_queries}
+         relation["boundary"="administrative"]["admin_level"="2"]{country_filter};
          );
-         out center;
+         out tags;
          """

-         # Query for ways (with geometry output)
-         ways_queries = []
-         for category, types in self.location_types.items():
-             ways_queries.append(
-                 f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
-             )
+         url = "http://overpass-api.de/api/interpreter"
+         response = requests.get(url, params={"data": query}, timeout=timeout)
+         response.raise_for_status()
+         data = response.json()

-         ways_queries = "\n".join(ways_queries)
+         countries = []
+         for element in data.get("elements", []):
+             tags = element.get("tags", {})

-         ways_query = f"""
-         [out:json][timeout:{self.timeout}];
-         {self.area_query}
-         (
-         {ways_queries}
-         );
-         out geom;
-         """
+             if include_names:
+                 country_info = {
+                     "name": tags.get("name", ""),
+                     "name:en": tags.get("name:en", ""),
+                     "official_name": tags.get("official_name", ""),
+                     "official_name:en": tags.get("official_name:en", ""),
+                     "ISO3166-1": tags.get("ISO3166-1", ""),
+                     "ISO3166-1:alpha2": tags.get("ISO3166-1:alpha2", ""),
+                     "ISO3166-1:alpha3": tags.get("ISO3166-1:alpha3", ""),
+                 }
+
+                 # Add any other name:* tags (translations)
+                 for key, value in tags.items():
+                     if key.startswith("name:") and key not in country_info:
+                         country_info[key] = value
+
+                 # Remove empty string values
+                 country_info = {k: v for k, v in country_info.items() if v}
+
+                 if country_info.get("name"):  # Only add if has a name
+                     countries.append(country_info)
+             else:
+                 name = tags.get("name")
+                 if name:
+                     countries.append(name)
+
+         # If looking for a specific country, return single result or raise error
+         if iso3_code:
+             if not countries:
+                 raise ValueError(
+                     f"Country with ISO3 code '{iso3_code}' not found in OSM database"
+                 )
+             return countries[0]  # Return single country, not a list

-         return [nodes_relations_query, ways_query]
+         # Return sorted list for all countries
+         return sorted(
+             countries, key=lambda x: x if isinstance(x, str) else x.get("name", "")
+         )

      def _make_request(self, query: str) -> Dict:
          """Make HTTP request to Overpass API with retry mechanism."""
@@ -213,23 +258,33 @@ class OSMLocationFetcher:
              _lon = element.get("lon") or element["center"]["lon"]
              point_geom = Point(_lon, _lat)

-             # for each matching category, create a separate element
+             # Extract metadata if available
+             metadata = {}
+             if "timestamp" in element:
+                 metadata["timestamp"] = element["timestamp"]
+                 metadata["version"] = element.get("version")
+                 metadata["changeset"] = element.get("changeset")
+                 metadata["user"] = element.get("user")
+                 metadata["uid"] = element.get("uid")
+
+             # For each matching category, create a separate element
              results = []
              for category, value in matching_categories.items():
-                 results.append(
-                     {
-                         "source_id": element["id"],
-                         "category": category,
-                         "category_value": value,
-                         "name": tags.get("name", ""),
-                         "name_en": tags.get("name:en", ""),
-                         "type": element["type"],
-                         "geometry": point_geom,
-                         "latitude": _lat,
-                         "longitude": _lon,
-                         "matching_categories": list(matching_categories.keys()),
-                     }
-                 )
+                 result = {
+                     "source_id": element["id"],
+                     "category": category,
+                     "category_value": value,
+                     "name": tags.get("name", ""),
+                     "name_en": tags.get("name:en", ""),
+                     "type": element["type"],
+                     "geometry": point_geom,
+                     "latitude": _lat,
+                     "longitude": _lon,
+                     "matching_categories": list(matching_categories.keys()),
+                 }
+                 # Add metadata if available
+                 result.update(metadata)
+                 results.append(result)

              return results

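Note: the metadata block above only populates when the element carries change attributes, i.e. when the query was issued with a "meta" output mode (see _build_queries below). An illustrative element shape, with invented values:

    # Overpass node element as returned with "out center meta":
    element = {
        "type": "node",
        "id": 123456789,
        "lat": 0.3476,
        "lon": 32.5825,
        "timestamp": "2024-03-15T14:30:00Z",  # presence of this key triggers extraction
        "version": 4,
        "changeset": 98765432,
        "user": "some_mapper",
        "uid": 42,
        "tags": {"amenity": "school", "name": "Example School"},
    }
    # Each result row then gains timestamp/version/changeset/user/uid
    # via result.update(metadata).
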
@@ -256,36 +311,121 @@ class OSMLocationFetcher:
              polygon = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
              centroid = polygon.centroid

+             # Extract metadata if available
+             metadata = {}
+             if "timestamp" in element:
+                 metadata["timestamp"] = element["timestamp"]
+                 metadata["version"] = element.get("version")
+                 metadata["changeset"] = element.get("changeset")
+                 metadata["user"] = element.get("user")
+                 metadata["uid"] = element.get("uid")
+
              # For each matching category, create a separate element
              results = []
              for category, value in matching_categories.items():
-                 results.append(
-                     {
-                         "source_id": element["id"],
-                         "category": category,
-                         "category_value": value,
-                         "name": tags.get("name", ""),
-                         "name_en": tags.get("name:en", ""),
-                         "type": element["type"],
-                         "geometry": polygon,
-                         "latitude": centroid.y,
-                         "longitude": centroid.x,
-                         "matching_categories": list(matching_categories.keys()),
-                     }
-                 )
+                 result = {
+                     "source_id": element["id"],
+                     "category": category,
+                     "category_value": value,
+                     "name": tags.get("name", ""),
+                     "name_en": tags.get("name:en", ""),
+                     "type": element["type"],
+                     "geometry": polygon,
+                     "latitude": centroid.y,
+                     "longitude": centroid.x,
+                     "matching_categories": list(matching_categories.keys()),
+                 }
+                 # Add metadata if available
+                 result.update(metadata)
+                 results.append(result)

              return results
          except (KeyError, ValueError) as e:
              self.logger.error(f"Error processing way geometry: {str(e)}")
              return []

+     def _build_queries(
+         self,
+         date_filter_type: Optional[Literal["newer", "changed"]] = None,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None,
+         include_metadata: bool = False,
+     ) -> List[str]:
+         """
+         Construct Overpass QL queries with optional date filtering and metadata.
+
+         Args:
+             date_filter_type: Type of date filter ('newer' or 'changed')
+             start_date: Start date in ISO 8601 format
+             end_date: End date in ISO 8601 format (required for 'changed')
+             include_metadata: If True, include change metadata (timestamp, version, changeset, user)
+
+         Returns:
+             List[str]: List of [nodes_relations_query, ways_query]
+         """
+         # Build the date filter based on type
+         if date_filter_type == "newer" and start_date:
+             date_filter = f'(newer:"{start_date}")'
+         elif date_filter_type == "changed" and start_date and end_date:
+             date_filter = f'(changed:"{start_date}","{end_date}")'
+         else:
+             date_filter = ""
+
+         # Determine output mode
+         output_mode = "center meta" if include_metadata else "center"
+         output_mode_geom = "geom meta" if include_metadata else "geom"
+
+         # Query for nodes and relations
+         nodes_relations_queries = []
+         for category, types in self.location_types.items():
+             nodes_relations_queries.extend(
+                 [
+                     f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                     f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                 ]
+             )
+
+         nodes_relations_queries = "\n".join(nodes_relations_queries)
+
+         nodes_relations_query = f"""
+         [out:json][timeout:{self.timeout}];
+         {self.area_query}
+         (
+         {nodes_relations_queries}
+         );
+         out {output_mode};
+         """
+
+         # Query for ways
+         ways_queries = []
+         for category, types in self.location_types.items():
+             ways_queries.append(
+                 f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
+             )
+
+         ways_queries = "\n".join(ways_queries)
+
+         ways_query = f"""
+         [out:json][timeout:{self.timeout}];
+         {self.area_query}
+         (
+         {ways_queries}
+         );
+         out {output_mode_geom};
+         """
+
+         return [nodes_relations_query, ways_query]
+
      def fetch_locations(
          self,
-         since_year: Optional[int] = None,
+         since_date: Optional[Union[str, datetime]] = None,
          handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+         include_metadata: bool = False,
      ) -> pd.DataFrame:
          """
-         Fetch and process OSM locations.
+         Fetch OSM locations, optionally filtered by 'since' date.
+
+         Use this for incremental updates or getting all current locations.

          Args:
-             since_year (int, optional): Filter for locations added/modified since this year.
+             since_date (str or datetime, optional): Filter for locations added/modified since this date.
@@ -293,6 +433,8 @@ class OSMLocationFetcher:
                  - 'separate': Create separate entries for each category (default)
                  - 'combine': Use a single entry with a list of matching categories
                  - 'primary': Keep only the first matching category
+             include_metadata: If True, include change tracking metadata
+                 (timestamp, version, changeset, user, uid)

          Returns:
              pd.DataFrame: Processed OSM locations
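
Note: to make the new date filters concrete, this is roughly the nodes/relations query _build_queries emits for location_types={"amenity": ["school", "hospital"]} with a "newer" filter and metadata enabled (the timeout value is illustrative; the area filter produced by self.area_query is not shown in this diff and is left as a placeholder):

    expected_nodes_relations_query = """
    [out:json][timeout:600];
    <area filter emitted by self.area_query>
    (
    node["amenity"~"^(school|hospital)"](newer:"2024-01-01T00:00:00Z")(area.searchArea);
    relation["amenity"~"^(school|hospital)"](newer:"2024-01-01T00:00:00Z")(area.searchArea);
    );
    out center meta;
    """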
@@ -306,10 +448,118 @@ class OSMLocationFetcher:
              f"Fetching OSM locations from Overpass API for country: {self.country}"
          )
          self.logger.info(f"Location types: {self.location_types}")
-         self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")

-         # Get queries for different element types
-         nodes_relations_query, ways_query = self._build_queries(since_year)
+         # Normalize date if provided
+         since_str = self._normalize_date(since_date) if since_date else None
+
+         if since_str:
+             self.logger.info(f"Filtering for changes since: {since_str}")
+
+         queries = self._build_queries(
+             date_filter_type="newer" if since_str else None,
+             start_date=since_str,
+             include_metadata=include_metadata,
+         )
+
+         return self._execute_and_process_queries(queries, handle_duplicates)
+
+     def fetch_locations_changed_between(
+         self,
+         start_date: Union[str, datetime],
+         end_date: Union[str, datetime],
+         handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+         include_metadata: bool = True,
+     ) -> pd.DataFrame:
+         """
+         Fetch OSM locations that changed within a specific date range.
+
+         Use this for historical analysis or tracking changes in a specific period.
+
+         Args:
+             start_date: Start date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ")
+                 or datetime object. Changes after this date will be included.
+             end_date: End date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ")
+                 or datetime object. Changes before this date will be included.
+             handle_duplicates: How to handle objects matching multiple categories:
+                 - 'separate': Create separate entries for each category (default)
+                 - 'combine': Use a single entry with a list of matching categories
+                 - 'primary': Keep only the first matching category
+             include_metadata: If True, include change tracking metadata
+                 (timestamp, version, changeset, user, uid).
+                 Defaults to True since change tracking is the main use case.
+
+         Returns:
+             pd.DataFrame: Processed OSM locations that changed within the date range
+
+         Raises:
+             ValueError: If dates are invalid or start_date is after end_date
+         """
+         start_str = self._normalize_date(start_date)
+         end_str = self._normalize_date(end_date)
+
+         if start_str >= end_str:
+             raise ValueError(
+                 f"start_date must be before end_date (got {start_str} >= {end_str})"
+             )
+
+         queries = self._build_queries(
+             date_filter_type="changed",
+             start_date=start_str,
+             end_date=end_str,
+             include_metadata=include_metadata,
+         )
+
+         return self._execute_and_process_queries(queries, handle_duplicates)
+
+     def _normalize_date(self, date_input: Union[str, datetime]) -> str:
+         """
+         Convert date input to ISO 8601 format string.
+
+         Args:
+             date_input: Either a string in ISO 8601 format or a datetime object
+
+         Returns:
+             str: Date in format "YYYY-MM-DDThh:mm:ssZ"
+
+         Raises:
+             ValueError: If string format is invalid
+         """
+         from datetime import datetime
+
+         if isinstance(date_input, datetime):
+             # Convert datetime to ISO 8601 string with Z (UTC) timezone
+             return date_input.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+         elif isinstance(date_input, str):
+             # Validate the string format
+             try:
+                 # Try to parse it to ensure it's valid
+                 datetime.strptime(date_input, "%Y-%m-%dT%H:%M:%SZ")
+                 return date_input
+             except ValueError:
+                 raise ValueError(
+                     f"Invalid date format: '{date_input}'. "
+                     "Expected format: 'YYYY-MM-DDThh:mm:ssZ' (e.g., '2024-03-15T14:30:00Z')"
+                 )
+         else:
+             raise TypeError(
+                 f"date_input must be str or datetime, got {type(date_input).__name__}"
+             )
+
+     def _execute_and_process_queries(
+         self, queries: List[str], handle_duplicates: str
+     ) -> pd.DataFrame:
+         """
+         Execute queries and process results (extracted from fetch_locations).
+
+         Args:
+             queries: List of [nodes_relations_query, ways_query]
+             handle_duplicates: Strategy for handling duplicate categories
+
+         Returns:
+             pd.DataFrame: Processed locations
+         """
+         nodes_relations_query, ways_query = queries

          # Fetch nodes and relations
          nodes_relations_response = self._make_request(nodes_relations_query)
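
Note: an end-to-end usage sketch of the two fetch paths (the constructor is not shown in this diff, so its arguments are assumed from the self.country and self.location_types attributes referenced above):

    from datetime import datetime

    fetcher = OSMLocationFetcher(  # constructor arguments assumed
        country="KEN",
        location_types={"amenity": ["school", "hospital"]},
    )

    # Incremental pull: everything added/modified since a date, with change metadata
    recent = fetcher.fetch_locations(
        since_date="2024-01-01T00:00:00Z",
        include_metadata=True,
    )

    # Historical window: str and datetime are both accepted; _normalize_date
    # coerces datetimes to "YYYY-MM-DDThh:mm:ssZ" and validates strings
    changed = fetcher.fetch_locations_changed_between(
        start_date=datetime(2024, 1, 1),
        end_date="2024-06-30T23:59:59Z",
    )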
@@ -352,16 +602,14 @@ class OSMLocationFetcher:
              self.logger.warning("No matching elements found after processing")
              return pd.DataFrame()

-         # Handle duplicates based on the specified strategy
+         # Handle duplicates (reuse existing logic from fetch_locations)
          if handle_duplicates != "separate":
-             # Group by source_id
              grouped_elements = {}
              for elem in all_elements:
                  source_id = elem["source_id"]
                  if source_id not in grouped_elements:
                      grouped_elements[source_id] = elem
                  elif handle_duplicates == "combine":
-                     # Combine matching categories
                      if grouped_elements[source_id]["category"] != elem["category"]:
                          if isinstance(grouped_elements[source_id]["category"], str):
                              grouped_elements[source_id]["category"] = [
@@ -381,44 +629,16 @@ class OSMLocationFetcher:
                              grouped_elements[source_id]["category_value"].append(
                                  elem["category_value"]
                              )
-                 # For 'primary', just keep the first one we encountered

              all_elements = list(grouped_elements.values())

          locations = pd.DataFrame(all_elements)

-         # Log element type distribution
+         # Log statistics
          type_counts = locations["type"].value_counts()
          self.logger.info("\nElement type distribution:")
          for element_type, count in type_counts.items():
              self.logger.info(f"{element_type}: {count}")

-         # Log category distribution
-         if handle_duplicates == "combine":
-             # Count each category separately when they're in lists
-             category_counts = {}
-             for cats in locations["category"]:
-                 if isinstance(cats, list):
-                     for cat in cats:
-                         category_counts[cat] = category_counts.get(cat, 0) + 1
-                 else:
-                     category_counts[cats] = category_counts.get(cats, 0) + 1
-
-             self.logger.info("\nCategory distribution:")
-             for category, count in category_counts.items():
-                 self.logger.info(f"{category}: {count}")
-         else:
-             category_counts = locations["category"].value_counts()
-             self.logger.info("\nCategory distribution:")
-             for category, count in category_counts.items():
-                 self.logger.info(f"{category}: {count}")
-
-         # Log elements with multiple matching categories
-         multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
-         if multi_category:
-             self.logger.info(
-                 f"\n{len(multi_category)} elements matched multiple categories"
-             )
-
          self.logger.info(f"Successfully processed {len(locations)} locations")
          return locations
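
Note: continuing the usage sketch above, how the three handle_duplicates strategies treat an element tagged with, say, both amenity=school and building=school:

    # 'separate' (default): two rows with the same source_id, one per category
    # 'combine': one row; "category" and "category_value" become lists, e.g.
    #            category=["amenity", "building"], category_value=["school", "school"]
    # 'primary': one row; the first category encountered is kept
    combined = fetcher.fetch_locations(handle_duplicates="combine")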