vfbquery-0.4.0-py3-none-any.whl → vfbquery-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/readme_parser.py +35 -1
- test/term_info_queries_test.py +11 -11
- test/test_dataset_template_queries.py +138 -0
- test/test_default_caching.py +15 -11
- test/test_expression_overlaps.py +183 -0
- test/test_expression_pattern_fragments.py +123 -0
- test/test_images_neurons.py +152 -0
- test/test_images_that_develop_from.py +112 -0
- test/test_lineage_clones_in.py +190 -0
- test/test_nblast_queries.py +124 -0
- test/test_neuron_classes_fasciculating.py +187 -0
- test/test_neuron_inputs.py +193 -0
- test/test_neuron_neuron_connectivity.py +89 -0
- test/test_neuron_region_connectivity.py +117 -0
- test/test_neurons_part_here.py +204 -0
- test/test_new_owlery_queries.py +282 -0
- test/test_publication_transgene_queries.py +101 -0
- test/test_query_performance.py +743 -0
- test/test_similar_morphology.py +177 -0
- test/test_tracts_nerves_innervating.py +188 -0
- test/test_transcriptomics.py +223 -0
- vfbquery/__init__.py +22 -1
- vfbquery/neo4j_client.py +120 -0
- vfbquery/owlery_client.py +463 -0
- vfbquery/solr_fetcher.py +1 -1
- vfbquery/solr_result_cache.py +238 -53
- vfbquery/vfb_queries.py +2969 -638
- {vfbquery-0.4.0.dist-info → vfbquery-0.5.0.dist-info}/METADATA +1023 -65
- vfbquery-0.5.0.dist-info/RECORD +39 -0
- vfbquery-0.4.0.dist-info/RECORD +0 -19
- {vfbquery-0.4.0.dist-info → vfbquery-0.5.0.dist-info}/LICENSE +0 -0
- {vfbquery-0.4.0.dist-info → vfbquery-0.5.0.dist-info}/WHEEL +0 -0
- {vfbquery-0.4.0.dist-info → vfbquery-0.5.0.dist-info}/top_level.txt +0 -0
vfbquery/solr_result_cache.py
CHANGED
```diff
@@ -18,6 +18,7 @@ from datetime import datetime, timedelta
 from typing import Dict, Any, Optional, List
 import logging
 from dataclasses import dataclass, asdict
+import pandas as pd
 from vfbquery.term_info_queries import NumpyEncoder
 
 logger = logging.getLogger(__name__)
@@ -95,11 +96,12 @@ class SolrResultCache:
             Cached result or None if not found/expired
         """
         try:
-            # Query for cache document with prefixed ID
-
+            # Query for cache document with prefixed ID including query type
+            # This ensures different query types for the same term have separate cache entries
             cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             response = requests.get(f"{self.cache_url}/select", params={
-                "q": f"id:{cache_doc_id}
+                "q": f"id:{cache_doc_id}",
                 "fl": "cache_data",
                 "wt": "json"
             }, timeout=5)  # Short timeout for cache lookups
```
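The `import pandas as pd` addition supports the DataFrame handling introduced in the decorator further down; the second hunk changes the Solr cache document ID to embed the query type so that, judging by the new comments, results of different query types for the same term no longer share one cache entry. A minimal sketch of the 0.5.0 key scheme (the term ID is illustrative):

```python
# Sketch of the 0.5.0 cache-key scheme from the hunk above; the term ID is illustrative.
def cache_doc_id(query_type: str, term_id: str) -> str:
    return f"vfb_query_{query_type}_{term_id}"

# Distinct query types for one term now map to distinct Solr documents:
assert cache_doc_id('term_info', 'FBbt_00003748') == 'vfb_query_term_info_FBbt_00003748'
assert cache_doc_id('instances', 'FBbt_00003748') == 'vfb_query_instances_FBbt_00003748'
```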
```diff
@@ -161,6 +163,14 @@ class SolrResultCache:
                 logger.warning(f"Failed to parse cached result for {term_id}")
                 return None
 
+            # IMPORTANT: Validate cached result - reject error results (count=-1)
+            # This ensures old cached errors get retried when the service is working again
+            if isinstance(result, dict) and 'count' in result:
+                if result.get('count', -1) < 0:
+                    logger.warning(f"Rejecting cached error result for {query_type}({term_id}): count={result.get('count')}")
+                    self._clear_expired_cache_document(cache_doc_id)
+                    return None
+
             logger.info(f"Cache hit for {query_type}({term_id})")
             return result
 
```
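The convention this check relies on (and which the decorator below applies consistently): `count=-1` marks a query that failed to execute, while `count=0` is a legitimate "no matches" answer. A sketch of the rule, assuming the dict shape used throughout the diff:

```python
# Sketch of the rejection rule above; the payloads are illustrative.
def is_stale_error(result) -> bool:
    """True when a cached dict records a failed query (count < 0)."""
    return isinstance(result, dict) and 'count' in result and result.get('count', -1) < 0

assert is_stale_error({'count': -1, 'rows': []})      # cached failure: cleared and retried
assert not is_stale_error({'count': 0, 'rows': []})   # "no matches": still a valid cache hit
```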
```diff
@@ -194,8 +204,9 @@ class SolrResultCache:
             if not cached_data:
                 return False  # Result too large or other issue
 
-            # Create cache document with prefixed ID
-
+            # Create cache document with prefixed ID including query type
+            # This ensures different query types for the same term have separate cache entries
             cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             cache_doc = {
                 "id": cache_doc_id,
@@ -240,6 +251,37 @@ class SolrResultCache:
         except Exception as e:
             logger.debug(f"Failed to clear expired cache document: {e}")
 
+    def clear_cache_entry(self, query_type: str, term_id: str) -> bool:
+        """
+        Manually clear a specific cache entry to force refresh
+
+        Args:
+            query_type: Type of query ('term_info', 'instances', etc.)
+            term_id: Term identifier
+
+        Returns:
+            True if successfully cleared, False otherwise
+        """
+        try:
+            # Include query_type in cache document ID to match storage format
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
+            response = requests.post(
+                f"{self.cache_url}/update",
+                data=f'<delete><id>{cache_doc_id}</id></delete>',
+                headers={"Content-Type": "application/xml"},
+                params={"commit": "true"},  # Commit immediately to ensure it's cleared
+                timeout=5
+            )
+            if response.status_code == 200:
+                logger.info(f"Cleared cache entry for {query_type}({term_id})")
+                return True
+            else:
+                logger.error(f"Failed to clear cache entry: HTTP {response.status_code}")
+                return False
+        except Exception as e:
+            logger.error(f"Error clearing cache entry: {e}")
+            return False
+
     def _increment_cache_hit_count(self, cache_doc_id: str, current_count: int):
         """Increment hit count for cache document (background operation)"""
         try:
```
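`clear_cache_entry` is the new public method that the decorator's `force_refresh` path calls. It is a plain Solr delete-by-ID followed by an immediate commit; a standalone equivalent would look roughly like this (the Solr core URL is illustrative):

```python
import requests

# Rough standalone equivalent of clear_cache_entry('term_info', 'FBbt_00003748');
# the Solr core URL is illustrative.
cache_url = 'https://solr.example.org/solr/vfb_cache'
cache_doc_id = 'vfb_query_term_info_FBbt_00003748'
response = requests.post(
    f'{cache_url}/update',
    data=f'<delete><id>{cache_doc_id}</id></delete>',
    headers={'Content-Type': 'application/xml'},
    params={'commit': 'true'},  # commit now so the next lookup misses
    timeout=5,
)
assert response.status_code == 200
```

The `commit=true` makes the deletion visible to the very next cache lookup, at the cost of one Solr commit per invalidation.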
```diff
@@ -269,10 +311,11 @@ class SolrResultCache:
             Dictionary with cache age info or None if not cached
         """
         try:
-
+            # Include query_type in cache document ID to match storage format
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             response = requests.get(f"{self.cache_url}/select", params={
-                "q": f"id:{cache_doc_id}
+                "q": f"id:{cache_doc_id}",
                 "fl": "cache_data,hit_count,last_accessed",
                 "wt": "json"
             }, timeout=5)
@@ -533,79 +576,221 @@ def with_solr_cache(query_type: str):
 
     Usage:
         @with_solr_cache('term_info')
-        def get_term_info(short_form, **kwargs):
+        def get_term_info(short_form, force_refresh=False, **kwargs):
             # ... existing implementation
+
+    The decorated function can accept a 'force_refresh' parameter to bypass cache.
     """
     def decorator(func):
         def wrapper(*args, **kwargs):
+            # Check if force_refresh is requested (pop it before passing to function)
+            force_refresh = kwargs.pop('force_refresh', False)
+
+            # Check if limit is applied - don't cache limited results as they're incomplete
+            limit = kwargs.get('limit', -1)
+            should_cache = (limit == -1)  # Only cache when getting all results (limit=-1)
+
+            # For neuron_neuron_connectivity_query, only cache when all parameters are defaults
+            if query_type == 'neuron_neuron_connectivity_query':
+                min_weight = kwargs.get('min_weight', 0)
+                direction = kwargs.get('direction', 'both')
+                should_cache = should_cache and (min_weight == 0) and (direction == 'both')
+
             # Extract term_id from first argument or kwargs
             term_id = args[0] if args else kwargs.get('short_form') or kwargs.get('term_id')
 
+            # For functions like get_templates that don't have a term_id, use query_type as cache key
             if not term_id:
-
-
+                if query_type == 'templates':
+                    # Use a fixed cache key for templates since it doesn't take a term_id
+                    term_id = 'all_templates'
+                else:
+                    logger.warning(f"No term_id found for caching {query_type}")
+                    return func(*args, **kwargs)
+
+            # Include preview parameter in cache key for term_info queries
+            # This ensures preview=True and preview=False have separate cache entries
+            cache_term_id = term_id
+            if query_type == 'term_info':
+                preview = kwargs.get('preview', True)  # Default is True
+                cache_term_id = f"{term_id}_preview_{preview}"
+
+            # Include return_dataframe parameter in cache key for queries that support it
+            # This ensures DataFrame and dict formats are cached separately
+            if query_type in ['instances', 'neurons_part_here', 'neurons_synaptic',
+                              'neurons_presynaptic', 'neurons_postsynaptic',
+                              'components_of', 'parts_of', 'subclasses_of',
+                              'neuron_classes_fasciculating_here', 'tracts_nerves_innervating_here',
+                              'lineage_clones_in', 'images_neurons', 'images_that_develop_from',
+                              'expression_pattern_fragments', 'neuron_neuron_connectivity_query']:
+                return_dataframe = kwargs.get('return_dataframe', True)  # Default is True
+                cache_term_id = f"{cache_term_id}_df_{return_dataframe}"
 
             cache = get_solr_cache()
 
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Check if query has unrealistic count (0 or -1) which indicates failed execution
-            if count <= 0:
-                is_valid = False
-                logger.debug(f"Cached result has invalid query count {count} for {term_id}")
-                break
-            # Check if preview_results is missing or has empty headers when it should have data
-            if not isinstance(preview_results, dict) or not headers:
-                is_valid = False
-                logger.debug(f"Cached result has invalid preview_results structure for {term_id}")
-                break
+            # Clear cache if force_refresh is True
+            if force_refresh:
+                logger.info(f"Force refresh requested for {query_type}({term_id})")
+                cache.clear_cache_entry(query_type, cache_term_id)
+
+            # Try cache first (will be empty if force_refresh was True)
+            # OPTIMIZATION: If requesting limited results, check if full results are cached
+            # If yes, we can extract the limited rows from the cached full results
+            if not force_refresh:
+                # First try to get cached result matching the exact query (including limit)
+                if should_cache:
+                    cached_result = cache.get_cached_result(query_type, cache_term_id, **kwargs)
+                else:
+                    # For limited queries, try to get full cached results instead
+                    full_kwargs = kwargs.copy()
+                    full_kwargs['limit'] = -1  # Get full results
+                    cached_result = cache.get_cached_result(query_type, cache_term_id, **full_kwargs)
 
-
-
-
+                # If we got full cached results, extract the limited portion
+                if cached_result is not None and limit > 0:
+                    logger.debug(f"Extracting first {limit} rows from cached full results for {term_id}")
+
+                    # Extract limited rows based on result type
+                    if isinstance(cached_result, dict) and 'rows' in cached_result:
+                        cached_result = {
+                            'headers': cached_result.get('headers', {}),
+                            'rows': cached_result['rows'][:limit],
+                            'count': cached_result.get('count', len(cached_result.get('rows', [])))
+                        }
+                    elif isinstance(cached_result, pd.DataFrame):
+                        # Keep the full count but limit the rows
+                        original_count = len(cached_result)
+                        cached_result = cached_result.head(limit)
+                        # Add count attribute if possible
+                        if hasattr(cached_result, '_metadata'):
+                            cached_result._metadata['count'] = original_count
+
+                if cached_result is not None:
+                    # Validate that cached result has essential fields for term_info
+                    if query_type == 'term_info':
+                        is_valid = (cached_result and isinstance(cached_result, dict) and
+                                    cached_result.get('Id') and cached_result.get('Name'))
+
+                        # Additional validation for query results - only when preview=True
+                        preview = kwargs.get('preview', True)  # Default is True
+                        if is_valid and preview and 'Queries' in cached_result:
+                            logger.debug(f"Validating {len(cached_result['Queries'])} queries for {term_id}")
+                            for i, query in enumerate(cached_result['Queries']):
+                                count = query.get('count', -1)  # Default to -1 if missing
+                                preview_results = query.get('preview_results')
+                                headers = preview_results.get('headers', []) if isinstance(preview_results, dict) else []
+
+                                logger.debug(f"Query {i}: count={count}, preview_results_type={type(preview_results)}, headers={headers}")
+
+                                # Check if query has error count (-1) which indicates failed execution
+                                # Note: count of 0 is valid - it means "no matches found"
+                                if count < 0:
+                                    is_valid = False
+                                    logger.debug(f"Cached result has error query count {count} for {term_id}")
+                                    break
+                                # Check if preview_results is missing or has empty headers when it should have data
+                                if not isinstance(preview_results, dict) or not headers:
+                                    is_valid = False
+                                    logger.debug(f"Cached result has invalid preview_results structure for {term_id}")
+                                    break
+
+                        if is_valid:
+                            logger.debug(f"Using valid cached result for {term_id}")
+                            return cached_result
+                        else:
+                            logger.warning(f"Cached result incomplete for {term_id}, re-executing function")
+                            # Don't return the incomplete cached result, continue to execute function
                     else:
-
-                        # Don't return the incomplete cached result, continue to execute function
-                    else:
-                        return cached_result
+                        return cached_result
 
             # Execute function and cache result
             result = func(*args, **kwargs)
 
             # Cache the result asynchronously to avoid blocking
-
+            # Handle DataFrame, dict, and other result types properly
+            result_is_valid = False
+            result_is_error = False  # Track if result is an error that should clear cache
+
+            if result is not None:
+                if hasattr(result, 'empty'):  # DataFrame
+                    result_is_valid = not result.empty
+                elif isinstance(result, dict):
+                    # For dict results, check if it's not an error result (count != -1)
+                    # Error results should not be cached
+                    if 'count' in result:
+                        count_value = result.get('count', -1)
+                        result_is_valid = count_value >= 0  # Don't cache errors (count=-1)
+                        result_is_error = count_value < 0  # Mark as error if count is negative
+                    else:
+                        result_is_valid = bool(result)  # For dicts without count field
+                elif isinstance(result, (list, str)):
+                    result_is_valid = len(result) > 0
+                else:
+                    result_is_valid = True
+
+            # If result is an error, actively clear any existing cache entry
+            # This ensures that transient failures don't get stuck in cache
+            if result_is_error:
+                logger.warning(f"Query returned error result for {query_type}({term_id}), clearing cache entry")
+                try:
+                    cache.clear_cache_entry(query_type, cache_term_id)
+                except Exception as e:
+                    logger.debug(f"Failed to clear cache entry: {e}")
+
+            if result_is_valid:
                 # Validate result before caching for term_info
                 if query_type == 'term_info':
-
-
+                    # Basic validation: must have Id and Name
+                    is_complete = (result and isinstance(result, dict) and
+                                   result.get('Id') and result.get('Name'))
+
+                    # Additional validation when preview=True: check if queries have results
+                    # We allow caching even if some queries failed (count=-1) as long as the core term_info is valid
+                    # This is because some query functions may not be implemented yet or may legitimately fail
+                    if is_complete:
+                        preview = kwargs.get('preview', True)
+                        if preview and 'Queries' in result and result['Queries']:
+                            # Count how many queries have valid results vs errors
+                            valid_queries = 0
+                            failed_queries = 0
+
+                            for query in result['Queries']:
+                                count = query.get('count', -1)
+                                preview_results = query.get('preview_results')
+
+                                # Count queries with valid results (count >= 0)
+                                if count >= 0 and isinstance(preview_results, dict):
+                                    valid_queries += 1
+                                else:
+                                    failed_queries += 1
+
+                            # Only reject if ALL queries failed - at least one must succeed
+                            if valid_queries == 0 and failed_queries > 0:
+                                is_complete = False
+                                logger.warning(f"Not caching result for {term_id}: all {failed_queries} queries failed")
+                            elif failed_queries > 0:
+                                logger.debug(f"Caching result for {term_id} with {valid_queries} valid queries ({failed_queries} failed)")
+
+                    # Only cache if result is complete AND no limit was applied
+                    if is_complete and should_cache:
                         try:
-                            cache.cache_result(query_type,
+                            cache.cache_result(query_type, cache_term_id, result, **kwargs)
                             logger.debug(f"Cached complete result for {term_id}")
                         except Exception as e:
                             logger.debug(f"Failed to cache result: {e}")
+                    elif not should_cache:
+                        logger.debug(f"Not caching limited result for {term_id} (limit={limit})")
                     else:
                         logger.warning(f"Not caching incomplete result for {term_id}")
                 else:
-
-
-
-
+                    # Only cache if no limit was applied
+                    if should_cache:
+                        try:
+                            cache.cache_result(query_type, cache_term_id, result, **kwargs)
+                        except Exception as e:
+                            logger.debug(f"Failed to cache result: {e}")
+                    else:
+                        logger.debug(f"Not caching limited result for {term_id} (limit={limit})")
 
             return result
 
```
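Taken together, the rewritten wrapper changes the calling convention for every `@with_solr_cache`-decorated query: `force_refresh=True` is popped by the wrapper, deletes the Solr cache document, and re-runs the query, while limited results are read from (but never written to) the cache. A usage sketch (the term ID is illustrative):

```python
import vfbquery as vfb

# Normal call: served from the Solr result cache when a complete,
# non-error result is already stored.
info = vfb.get_term_info('FBbt_00003748')

# Cache bypass: the wrapper pops force_refresh, deletes the cache
# document via clear_cache_entry, re-executes, and re-caches.
info = vfb.get_term_info('FBbt_00003748', force_refresh=True)
```

Note the asymmetry in what gets cached: only complete results (`limit=-1`, and for connectivity queries only default parameters) are written, while a limited request is answered by slicing `rows[:limit]` (or `DataFrame.head(limit)`) out of a cached full result when one exists.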