vfbquery 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ from datetime import datetime, timedelta
 from typing import Dict, Any, Optional, List
 import logging
 from dataclasses import dataclass, asdict
+import pandas as pd
 from vfbquery.term_info_queries import NumpyEncoder
 
 logger = logging.getLogger(__name__)
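The new pandas import supports the DataFrame-aware cache handling added to the decorator later in this diff: when a caller asks for a limited result and the full result is already cached, the wrapper slices the cached DataFrame instead of re-running the query. A minimal sketch of that truncation idea (the helper name is illustrative, not part of the package):

```python
import pandas as pd

def truncate_cached(df: pd.DataFrame, limit: int) -> pd.DataFrame:
    """Return the first `limit` rows while remembering the full row count."""
    original_count = len(df)
    limited = df.head(limit)
    # Note: pandas reserves DataFrame._metadata as a *list* of attribute names
    # for subclasses, so the dict-style `_metadata['count'] = ...` assignment in
    # the decorator hunk below would raise TypeError if that branch is reached.
    # `DataFrame.attrs` is the supported dict for arbitrary metadata.
    limited.attrs['count'] = original_count
    return limited

full = pd.DataFrame({'id': range(10)})
preview = truncate_cached(full, limit=3)
assert len(preview) == 3 and preview.attrs['count'] == 10
```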
@@ -95,11 +96,12 @@ class SolrResultCache:
             Cached result or None if not found/expired
         """
         try:
-            # Query for cache document with prefixed ID
-            cache_doc_id = f"vfb_query_{term_id}"
+            # Query for cache document with prefixed ID including query type
+            # This ensures different query types for the same term have separate cache entries
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             response = requests.get(f"{self.cache_url}/select", params={
-                "q": f"id:{cache_doc_id} AND query_type:{query_type}",
+                "q": f"id:{cache_doc_id}",
                 "fl": "cache_data",
                 "wt": "json"
             }, timeout=5)  # Short timeout for cache lookups
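The practical effect of the new ID scheme: in 0.4.0 every query type for a term was written to the same document id, `vfb_query_{term_id}`, so a cached `instances` result could silently overwrite a `term_info` result for the same term. Keying the document on both values makes each pair independent and lets the Solr lookup become a pure id match. A sketch (the FBbt id is just an example term):

```python
def cache_doc_id(query_type: str, term_id: str) -> str:
    # One Solr cache document per (query_type, term_id) pair.
    return f"vfb_query_{query_type}_{term_id}"

# 0.4.0 keyed both of these to "vfb_query_FBbt_00003748"; now they differ:
print(cache_doc_id('term_info', 'FBbt_00003748'))   # vfb_query_term_info_FBbt_00003748
print(cache_doc_id('instances', 'FBbt_00003748'))   # vfb_query_instances_FBbt_00003748
```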
@@ -161,6 +163,14 @@ class SolrResultCache:
                 logger.warning(f"Failed to parse cached result for {term_id}")
                 return None
 
+            # IMPORTANT: Validate cached result - reject error results (count=-1)
+            # This ensures old cached errors get retried when the service is working again
+            if isinstance(result, dict) and 'count' in result:
+                if result.get('count', -1) < 0:
+                    logger.warning(f"Rejecting cached error result for {query_type}({term_id}): count={result.get('count')}")
+                    self._clear_expired_cache_document(cache_doc_id)
+                    return None
+
             logger.info(f"Cache hit for {query_type}({term_id})")
             return result
 
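This validation relies on the module's convention, visible in the comments, that a count of -1 marks a failed query while 0 is a legitimate empty answer. A small sketch of the predicate being applied here (the helper name is mine, not the package's):

```python
def is_cached_error(result) -> bool:
    # Negative count is the failure sentinel; 0 means the query ran
    # fine and simply matched nothing, so it must stay cacheable.
    return isinstance(result, dict) and 'count' in result and result.get('count', -1) < 0

assert is_cached_error({'count': -1})
assert not is_cached_error({'count': 0})   # empty but valid
assert not is_cached_error({'rows': []})   # no count field, so not an error
```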
@@ -194,8 +204,9 @@ class SolrResultCache:
             if not cached_data:
                 return False  # Result too large or other issue
 
-            # Create cache document with prefixed ID
-            cache_doc_id = f"vfb_query_{term_id}"
+            # Create cache document with prefixed ID including query type
+            # This ensures different query types for the same term have separate cache entries
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             cache_doc = {
                 "id": cache_doc_id,
@@ -240,6 +251,37 @@ class SolrResultCache:
         except Exception as e:
             logger.debug(f"Failed to clear expired cache document: {e}")
 
+    def clear_cache_entry(self, query_type: str, term_id: str) -> bool:
+        """
+        Manually clear a specific cache entry to force refresh
+
+        Args:
+            query_type: Type of query ('term_info', 'instances', etc.)
+            term_id: Term identifier
+
+        Returns:
+            True if successfully cleared, False otherwise
+        """
+        try:
+            # Include query_type in cache document ID to match storage format
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
+            response = requests.post(
+                f"{self.cache_url}/update",
+                data=f'<delete><id>{cache_doc_id}</id></delete>',
+                headers={"Content-Type": "application/xml"},
+                params={"commit": "true"},  # Commit immediately to ensure it's cleared
+                timeout=5
+            )
+            if response.status_code == 200:
+                logger.info(f"Cleared cache entry for {query_type}({term_id})")
+                return True
+            else:
+                logger.error(f"Failed to clear cache entry: HTTP {response.status_code}")
+                return False
+        except Exception as e:
+            logger.error(f"Error clearing cache entry: {e}")
+            return False
+
     def _increment_cache_hit_count(self, cache_doc_id: str, current_count: int):
         """Increment hit count for cache document (background operation)"""
         try:
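The new method uses Solr's standard XML delete-by-id with an immediate commit. One subtlety when calling it directly: the decorator below stores some entries under suffixed keys (for example `_preview_True` for term_info), so a manual clear has to pass the same decorated key. For example:

```python
cache = get_solr_cache()

# Plain keys work for query types the decorator does not suffix:
cache.clear_cache_entry('templates', 'all_templates')

# term_info keys carry the preview flag appended by the decorator:
cache.clear_cache_entry('term_info', 'FBbt_00003748_preview_True')
```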
@@ -269,10 +311,11 @@ class SolrResultCache:
             Dictionary with cache age info or None if not cached
         """
         try:
-            cache_doc_id = f"vfb_query_{term_id}"
+            # Include query_type in cache document ID to match storage format
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             response = requests.get(f"{self.cache_url}/select", params={
-                "q": f"id:{cache_doc_id} AND query_type:{query_type}",
+                "q": f"id:{cache_doc_id}",
                 "fl": "cache_data,hit_count,last_accessed",
                 "wt": "json"
             }, timeout=5)
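The same simplification applies to the metadata lookup: a single id query replaces the id-plus-query_type filter. For debugging you can reproduce the request against the cache core yourself; a sketch assuming a local Solr URL (a placeholder, not the project's real endpoint):

```python
import requests

cache_url = "http://localhost:8983/solr/vfb_cache"  # placeholder core URL

params = {
    "q": "id:vfb_query_term_info_FBbt_00003748_preview_True",
    "fl": "cache_data,hit_count,last_accessed",
    "wt": "json",
}
docs = requests.get(f"{cache_url}/select", params=params, timeout=5).json()["response"]["docs"]
print(docs[0]["hit_count"] if docs else "not cached")
```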
@@ -533,79 +576,221 @@ def with_solr_cache(query_type: str):
 
     Usage:
         @with_solr_cache('term_info')
-        def get_term_info(short_form, **kwargs):
+        def get_term_info(short_form, force_refresh=False, **kwargs):
             # ... existing implementation
+
+    The decorated function can accept a 'force_refresh' parameter to bypass cache.
     """
     def decorator(func):
         def wrapper(*args, **kwargs):
+            # Check if force_refresh is requested (pop it before passing to function)
+            force_refresh = kwargs.pop('force_refresh', False)
+
+            # Check if limit is applied - don't cache limited results as they're incomplete
+            limit = kwargs.get('limit', -1)
+            should_cache = (limit == -1)  # Only cache when getting all results (limit=-1)
+
+            # For neuron_neuron_connectivity_query, only cache when all parameters are defaults
+            if query_type == 'neuron_neuron_connectivity_query':
+                min_weight = kwargs.get('min_weight', 0)
+                direction = kwargs.get('direction', 'both')
+                should_cache = should_cache and (min_weight == 0) and (direction == 'both')
+
             # Extract term_id from first argument or kwargs
             term_id = args[0] if args else kwargs.get('short_form') or kwargs.get('term_id')
 
+            # For functions like get_templates that don't have a term_id, use query_type as cache key
             if not term_id:
-                logger.warning("No term_id found for caching")
-                return func(*args, **kwargs)
+                if query_type == 'templates':
+                    # Use a fixed cache key for templates since it doesn't take a term_id
+                    term_id = 'all_templates'
+                else:
+                    logger.warning(f"No term_id found for caching {query_type}")
+                    return func(*args, **kwargs)
+
+            # Include preview parameter in cache key for term_info queries
+            # This ensures preview=True and preview=False have separate cache entries
+            cache_term_id = term_id
+            if query_type == 'term_info':
+                preview = kwargs.get('preview', True)  # Default is True
+                cache_term_id = f"{term_id}_preview_{preview}"
+
+            # Include return_dataframe parameter in cache key for queries that support it
+            # This ensures DataFrame and dict formats are cached separately
+            if query_type in ['instances', 'neurons_part_here', 'neurons_synaptic',
+                              'neurons_presynaptic', 'neurons_postsynaptic',
+                              'components_of', 'parts_of', 'subclasses_of',
+                              'neuron_classes_fasciculating_here', 'tracts_nerves_innervating_here',
+                              'lineage_clones_in', 'images_neurons', 'images_that_develop_from',
+                              'expression_pattern_fragments', 'neuron_neuron_connectivity_query']:
+                return_dataframe = kwargs.get('return_dataframe', True)  # Default is True
+                cache_term_id = f"{cache_term_id}_df_{return_dataframe}"
 
             cache = get_solr_cache()
 
-            # Try cache first
-            cached_result = cache.get_cached_result(query_type, term_id, **kwargs)
-            if cached_result is not None:
-                # Validate that cached result has essential fields for term_info
-                if query_type == 'term_info':
-                    is_valid = (cached_result and isinstance(cached_result, dict) and
-                                cached_result.get('Id') and cached_result.get('Name'))
-
-                    # Additional validation for query results
-                    if is_valid and 'Queries' in cached_result:
-                        logger.debug(f"Validating {len(cached_result['Queries'])} queries for {term_id}")
-                        for i, query in enumerate(cached_result['Queries']):
-                            count = query.get('count', 0)
-                            preview_results = query.get('preview_results')
-                            headers = preview_results.get('headers', []) if isinstance(preview_results, dict) else []
-
-                            logger.debug(f"Query {i}: count={count}, preview_results_type={type(preview_results)}, headers={headers}")
-
-                            # Check if query has unrealistic count (0 or -1) which indicates failed execution
-                            if count <= 0:
-                                is_valid = False
-                                logger.debug(f"Cached result has invalid query count {count} for {term_id}")
-                                break
-                            # Check if preview_results is missing or has empty headers when it should have data
-                            if not isinstance(preview_results, dict) or not headers:
-                                is_valid = False
-                                logger.debug(f"Cached result has invalid preview_results structure for {term_id}")
-                                break
+            # Clear cache if force_refresh is True
+            if force_refresh:
+                logger.info(f"Force refresh requested for {query_type}({term_id})")
+                cache.clear_cache_entry(query_type, cache_term_id)
+
+            # Try cache first (will be empty if force_refresh was True)
+            # OPTIMIZATION: If requesting limited results, check if full results are cached
+            # If yes, we can extract the limited rows from the cached full results
+            if not force_refresh:
+                # First try to get cached result matching the exact query (including limit)
+                if should_cache:
+                    cached_result = cache.get_cached_result(query_type, cache_term_id, **kwargs)
+                else:
+                    # For limited queries, try to get full cached results instead
+                    full_kwargs = kwargs.copy()
+                    full_kwargs['limit'] = -1  # Get full results
+                    cached_result = cache.get_cached_result(query_type, cache_term_id, **full_kwargs)
 
-                    if is_valid:
-                        logger.debug(f"Using valid cached result for {term_id}")
-                        return cached_result
+                # If we got full cached results, extract the limited portion
+                if cached_result is not None and limit > 0:
+                    logger.debug(f"Extracting first {limit} rows from cached full results for {term_id}")
+
+                    # Extract limited rows based on result type
+                    if isinstance(cached_result, dict) and 'rows' in cached_result:
+                        cached_result = {
+                            'headers': cached_result.get('headers', {}),
+                            'rows': cached_result['rows'][:limit],
+                            'count': cached_result.get('count', len(cached_result.get('rows', [])))
+                        }
+                    elif isinstance(cached_result, pd.DataFrame):
+                        # Keep the full count but limit the rows
+                        original_count = len(cached_result)
+                        cached_result = cached_result.head(limit)
+                        # Add count attribute if possible
+                        if hasattr(cached_result, '_metadata'):
+                            cached_result._metadata['count'] = original_count
+
+                if cached_result is not None:
+                    # Validate that cached result has essential fields for term_info
+                    if query_type == 'term_info':
+                        is_valid = (cached_result and isinstance(cached_result, dict) and
+                                    cached_result.get('Id') and cached_result.get('Name'))
+
+                        # Additional validation for query results - only when preview=True
+                        preview = kwargs.get('preview', True)  # Default is True
+                        if is_valid and preview and 'Queries' in cached_result:
+                            logger.debug(f"Validating {len(cached_result['Queries'])} queries for {term_id}")
+                            for i, query in enumerate(cached_result['Queries']):
+                                count = query.get('count', -1)  # Default to -1 if missing
+                                preview_results = query.get('preview_results')
+                                headers = preview_results.get('headers', []) if isinstance(preview_results, dict) else []
+
+                                logger.debug(f"Query {i}: count={count}, preview_results_type={type(preview_results)}, headers={headers}")
+
+                                # Check if query has error count (-1) which indicates failed execution
+                                # Note: count of 0 is valid - it means "no matches found"
+                                if count < 0:
+                                    is_valid = False
+                                    logger.debug(f"Cached result has error query count {count} for {term_id}")
+                                    break
+                                # Check if preview_results is missing or has empty headers when it should have data
+                                if not isinstance(preview_results, dict) or not headers:
+                                    is_valid = False
+                                    logger.debug(f"Cached result has invalid preview_results structure for {term_id}")
+                                    break
+
+                        if is_valid:
+                            logger.debug(f"Using valid cached result for {term_id}")
+                            return cached_result
+                        else:
+                            logger.warning(f"Cached result incomplete for {term_id}, re-executing function")
+                            # Don't return the incomplete cached result, continue to execute function
                     else:
-                        logger.warning(f"Cached result incomplete for {term_id}, re-executing function")
-                        # Don't return the incomplete cached result, continue to execute function
-                else:
-                    return cached_result
+                        return cached_result
 
             # Execute function and cache result
             result = func(*args, **kwargs)
 
             # Cache the result asynchronously to avoid blocking
-            if result:
+            # Handle DataFrame, dict, and other result types properly
+            result_is_valid = False
+            result_is_error = False  # Track if result is an error that should clear cache
+
+            if result is not None:
+                if hasattr(result, 'empty'):  # DataFrame
+                    result_is_valid = not result.empty
+                elif isinstance(result, dict):
+                    # For dict results, check if it's not an error result (count != -1)
+                    # Error results should not be cached
+                    if 'count' in result:
+                        count_value = result.get('count', -1)
+                        result_is_valid = count_value >= 0  # Don't cache errors (count=-1)
+                        result_is_error = count_value < 0  # Mark as error if count is negative
+                    else:
+                        result_is_valid = bool(result)  # For dicts without count field
+                elif isinstance(result, (list, str)):
+                    result_is_valid = len(result) > 0
+                else:
+                    result_is_valid = True
+
+            # If result is an error, actively clear any existing cache entry
+            # This ensures that transient failures don't get stuck in cache
+            if result_is_error:
+                logger.warning(f"Query returned error result for {query_type}({term_id}), clearing cache entry")
+                try:
+                    cache.clear_cache_entry(query_type, cache_term_id)
+                except Exception as e:
+                    logger.debug(f"Failed to clear cache entry: {e}")
+
+            if result_is_valid:
                 # Validate result before caching for term_info
                 if query_type == 'term_info':
-                    if (result and isinstance(result, dict) and
-                        result.get('Id') and result.get('Name')):
+                    # Basic validation: must have Id and Name
+                    is_complete = (result and isinstance(result, dict) and
+                                   result.get('Id') and result.get('Name'))
+
+                    # Additional validation when preview=True: check if queries have results
+                    # We allow caching even if some queries failed (count=-1) as long as the core term_info is valid
+                    # This is because some query functions may not be implemented yet or may legitimately fail
+                    if is_complete:
+                        preview = kwargs.get('preview', True)
+                        if preview and 'Queries' in result and result['Queries']:
+                            # Count how many queries have valid results vs errors
+                            valid_queries = 0
+                            failed_queries = 0
+
+                            for query in result['Queries']:
+                                count = query.get('count', -1)
+                                preview_results = query.get('preview_results')
+
+                                # Count queries with valid results (count >= 0)
+                                if count >= 0 and isinstance(preview_results, dict):
+                                    valid_queries += 1
+                                else:
+                                    failed_queries += 1
+
+                            # Only reject if ALL queries failed - at least one must succeed
+                            if valid_queries == 0 and failed_queries > 0:
+                                is_complete = False
+                                logger.warning(f"Not caching result for {term_id}: all {failed_queries} queries failed")
+                            elif failed_queries > 0:
+                                logger.debug(f"Caching result for {term_id} with {valid_queries} valid queries ({failed_queries} failed)")
+
+                    # Only cache if result is complete AND no limit was applied
+                    if is_complete and should_cache:
                         try:
-                            cache.cache_result(query_type, term_id, result, **kwargs)
+                            cache.cache_result(query_type, cache_term_id, result, **kwargs)
                             logger.debug(f"Cached complete result for {term_id}")
                         except Exception as e:
                             logger.debug(f"Failed to cache result: {e}")
+                    elif not should_cache:
+                        logger.debug(f"Not caching limited result for {term_id} (limit={limit})")
                     else:
                         logger.warning(f"Not caching incomplete result for {term_id}")
                 else:
-                    try:
-                        cache.cache_result(query_type, term_id, result, **kwargs)
-                    except Exception as e:
-                        logger.debug(f"Failed to cache result: {e}")
+                    # Only cache if no limit was applied
+                    if should_cache:
+                        try:
+                            cache.cache_result(query_type, cache_term_id, result, **kwargs)
+                        except Exception as e:
+                            logger.debug(f"Failed to cache result: {e}")
+                    else:
+                        logger.debug(f"Not caching limited result for {term_id} (limit={limit})")
 
             return result
 
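Taken together, the rewritten wrapper gives every decorated query three behaviours: transparent caching under the composite key, cache bypass via force_refresh, and serving limited requests by slicing a cached full result. A usage sketch (the function body is a stand-in, not the package's real implementation):

```python
@with_solr_cache('term_info')
def get_term_info(short_form, preview=True, **kwargs):
    # Stand-in body; the real function builds the full term-info payload.
    return {'Id': short_form, 'Name': 'example term', 'Queries': []}

info = get_term_info('FBbt_00003748')                       # cached under ..._preview_True
fresh = get_term_info('FBbt_00003748', force_refresh=True)  # deletes the entry, recomputes
```

Note that a call with, say, limit=5 is never written to the cache; instead the wrapper looks up the limit=-1 entry and returns its first five rows, so a later unlimited call still gets a complete answer.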