vfbquery 0.4.1-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/readme_parser.py +15 -9
- test/term_info_queries_test.py +4 -4
- test/test_dataset_template_queries.py +138 -0
- test/test_default_caching.py +15 -11
- test/test_expression_overlaps.py +183 -0
- test/test_expression_pattern_fragments.py +123 -0
- test/test_images_neurons.py +152 -0
- test/test_images_that_develop_from.py +112 -0
- test/test_lineage_clones_in.py +190 -0
- test/test_nblast_queries.py +124 -0
- test/test_neuron_classes_fasciculating.py +187 -0
- test/test_neuron_inputs.py +193 -0
- test/test_neuron_neuron_connectivity.py +89 -0
- test/test_neuron_region_connectivity.py +117 -0
- test/test_neurons_part_here.py +204 -0
- test/test_new_owlery_queries.py +282 -0
- test/test_publication_transgene_queries.py +101 -0
- test/test_query_performance.py +743 -0
- test/test_similar_morphology.py +177 -0
- test/test_tracts_nerves_innervating.py +188 -0
- test/test_transcriptomics.py +223 -0
- vfbquery/__init__.py +1 -1
- vfbquery/neo4j_client.py +120 -0
- vfbquery/owlery_client.py +463 -0
- vfbquery/solr_fetcher.py +1 -1
- vfbquery/solr_result_cache.py +163 -24
- vfbquery/vfb_queries.py +2936 -625
- {vfbquery-0.4.1.dist-info → vfbquery-0.5.0.dist-info}/METADATA +1007 -49
- vfbquery-0.5.0.dist-info/RECORD +39 -0
- vfbquery-0.4.1.dist-info/RECORD +0 -19
- {vfbquery-0.4.1.dist-info → vfbquery-0.5.0.dist-info}/LICENSE +0 -0
- {vfbquery-0.4.1.dist-info → vfbquery-0.5.0.dist-info}/WHEEL +0 -0
- {vfbquery-0.4.1.dist-info → vfbquery-0.5.0.dist-info}/top_level.txt +0 -0
vfbquery/solr_result_cache.py
CHANGED
```diff
@@ -18,6 +18,7 @@ from datetime import datetime, timedelta
 from typing import Dict, Any, Optional, List
 import logging
 from dataclasses import dataclass, asdict
+import pandas as pd
 from vfbquery.term_info_queries import NumpyEncoder
 
 logger = logging.getLogger(__name__)
@@ -95,11 +96,12 @@ class SolrResultCache:
             Cached result or None if not found/expired
         """
         try:
-            # Query for cache document with prefixed ID
-
+            # Query for cache document with prefixed ID including query type
+            # This ensures different query types for the same term have separate cache entries
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             response = requests.get(f"{self.cache_url}/select", params={
-                "q": f"id:{cache_doc_id}
+                "q": f"id:{cache_doc_id}",
                 "fl": "cache_data",
                 "wt": "json"
             }, timeout=5)  # Short timeout for cache lookups
```
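The hunk above is the core of this release's cache rework: the Solr cache document ID now embeds the query type, so different query types against the same term get distinct cache entries instead of sharing one key. A minimal sketch of the resulting key scheme (the helper name and example term ID are illustrative, not from the package):

```python
def cache_doc_id(query_type: str, term_id: str) -> str:
    # 0.5.0 scheme: one Solr cache document per (query_type, term) pair
    return f"vfb_query_{query_type}_{term_id}"

# Distinct query types against the same term no longer collide:
print(cache_doc_id("term_info", "FBbt_00003748"))  # vfb_query_term_info_FBbt_00003748
print(cache_doc_id("instances", "FBbt_00003748"))  # vfb_query_instances_FBbt_00003748
```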
```diff
@@ -161,6 +163,14 @@
                 logger.warning(f"Failed to parse cached result for {term_id}")
                 return None
 
+            # IMPORTANT: Validate cached result - reject error results (count=-1)
+            # This ensures old cached errors get retried when the service is working again
+            if isinstance(result, dict) and 'count' in result:
+                if result.get('count', -1) < 0:
+                    logger.warning(f"Rejecting cached error result for {query_type}({term_id}): count={result.get('count')}")
+                    self._clear_expired_cache_document(cache_doc_id)
+                    return None
+
             logger.info(f"Cache hit for {query_type}({term_id})")
             return result
 
```
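Throughout these changes, `count` doubles as a status flag: `-1` marks a failed query execution, `0` a legitimately empty result, and positive values the number of matches. A hedged restatement of the rejection rule added above (the function name is illustrative):

```python
from typing import Any, Optional

def accept_cached(result: Any) -> Optional[Any]:
    # Mirror of the check above: a cached dict whose count is negative
    # is a stored error and must be dropped so the query gets retried.
    if isinstance(result, dict) and 'count' in result and result['count'] < 0:
        return None
    return result

assert accept_cached({'count': -1}) is None           # cached failure: retry
assert accept_cached({'count': 0}) == {'count': 0}    # empty result: valid hit
```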
```diff
@@ -194,8 +204,9 @@
             if not cached_data:
                 return False  # Result too large or other issue
 
-            # Create cache document with prefixed ID
-
+            # Create cache document with prefixed ID including query type
+            # This ensures different query types for the same term have separate cache entries
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             cache_doc = {
                 "id": cache_doc_id,
@@ -252,7 +263,8 @@
             True if successfully cleared, False otherwise
         """
         try:
-
+            # Include query_type in cache document ID to match storage format
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
             response = requests.post(
                 f"{self.cache_url}/update",
                 data=f'<delete><id>{cache_doc_id}</id></delete>',
@@ -299,10 +311,11 @@
             Dictionary with cache age info or None if not cached
         """
         try:
-
+            # Include query_type in cache document ID to match storage format
+            cache_doc_id = f"vfb_query_{query_type}_{term_id}"
 
             response = requests.get(f"{self.cache_url}/select", params={
-                "q": f"id:{cache_doc_id}
+                "q": f"id:{cache_doc_id}",
                 "fl": "cache_data,hit_count,last_accessed",
                 "wt": "json"
             }, timeout=5)
```
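The same key scheme now also governs invalidation and age lookups, so an entry written under one ID can always be found again. For orientation, a self-contained sketch of the age lookup against a Solr core (the URL is a placeholder; the field names match the `fl` list above):

```python
import requests

def get_cache_age(cache_url: str, query_type: str, term_id: str):
    # Hypothetical standalone version of the lookup above.
    cache_doc_id = f"vfb_query_{query_type}_{term_id}"
    response = requests.get(f"{cache_url}/select", params={
        "q": f"id:{cache_doc_id}",
        "fl": "cache_data,hit_count,last_accessed",
        "wt": "json",
    }, timeout=5)
    docs = response.json().get("response", {}).get("docs", [])
    return docs[0] if docs else None  # None when the term is not cached

# Usage (placeholder URL):
# info = get_cache_age("https://solr.example.org/solr/vfb_json", "term_info", "FBbt_00003748")
```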
```diff
@@ -573,6 +586,16 @@ def with_solr_cache(query_type: str):
         # Check if force_refresh is requested (pop it before passing to function)
         force_refresh = kwargs.pop('force_refresh', False)
 
+        # Check if limit is applied - don't cache limited results as they're incomplete
+        limit = kwargs.get('limit', -1)
+        should_cache = (limit == -1)  # Only cache when getting all results (limit=-1)
+
+        # For neuron_neuron_connectivity_query, only cache when all parameters are defaults
+        if query_type == 'neuron_neuron_connectivity_query':
+            min_weight = kwargs.get('min_weight', 0)
+            direction = kwargs.get('direction', 'both')
+            should_cache = should_cache and (min_weight == 0) and (direction == 'both')
+
         # Extract term_id from first argument or kwargs
         term_id = args[0] if args else kwargs.get('short_form') or kwargs.get('term_id')
 
```
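In effect, the wrapper now writes to the cache only when a call asks for the complete result set, and for connectivity queries only at default parameters. The guard, restated as a standalone predicate (name illustrative):

```python
def should_cache(query_type: str, **kwargs) -> bool:
    # Cache only when the caller asked for the full result set...
    ok = kwargs.get('limit', -1) == -1
    # ...and, for connectivity queries, only at default parameters.
    if query_type == 'neuron_neuron_connectivity_query':
        ok = ok and kwargs.get('min_weight', 0) == 0 \
                and kwargs.get('direction', 'both') == 'both'
    return ok

assert should_cache('instances')                                   # full query
assert not should_cache('instances', limit=10)                     # partial query
assert not should_cache('neuron_neuron_connectivity_query', min_weight=5)
```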
```diff
@@ -585,36 +608,85 @@
             logger.warning(f"No term_id found for caching {query_type}")
             return func(*args, **kwargs)
 
+        # Include preview parameter in cache key for term_info queries
+        # This ensures preview=True and preview=False have separate cache entries
+        cache_term_id = term_id
+        if query_type == 'term_info':
+            preview = kwargs.get('preview', True)  # Default is True
+            cache_term_id = f"{term_id}_preview_{preview}"
+
+        # Include return_dataframe parameter in cache key for queries that support it
+        # This ensures DataFrame and dict formats are cached separately
+        if query_type in ['instances', 'neurons_part_here', 'neurons_synaptic',
+                          'neurons_presynaptic', 'neurons_postsynaptic',
+                          'components_of', 'parts_of', 'subclasses_of',
+                          'neuron_classes_fasciculating_here', 'tracts_nerves_innervating_here',
+                          'lineage_clones_in', 'images_neurons', 'images_that_develop_from',
+                          'expression_pattern_fragments', 'neuron_neuron_connectivity_query']:
+            return_dataframe = kwargs.get('return_dataframe', True)  # Default is True
+            cache_term_id = f"{cache_term_id}_df_{return_dataframe}"
+
         cache = get_solr_cache()
 
         # Clear cache if force_refresh is True
         if force_refresh:
             logger.info(f"Force refresh requested for {query_type}({term_id})")
-            cache.clear_cache_entry(query_type,
+            cache.clear_cache_entry(query_type, cache_term_id)
 
         # Try cache first (will be empty if force_refresh was True)
+        # OPTIMIZATION: If requesting limited results, check if full results are cached
+        # If yes, we can extract the limited rows from the cached full results
         if not force_refresh:
-
+            # First try to get cached result matching the exact query (including limit)
+            if should_cache:
+                cached_result = cache.get_cached_result(query_type, cache_term_id, **kwargs)
+            else:
+                # For limited queries, try to get full cached results instead
+                full_kwargs = kwargs.copy()
+                full_kwargs['limit'] = -1  # Get full results
+                cached_result = cache.get_cached_result(query_type, cache_term_id, **full_kwargs)
+
+                # If we got full cached results, extract the limited portion
+                if cached_result is not None and limit > 0:
+                    logger.debug(f"Extracting first {limit} rows from cached full results for {term_id}")
+
+                    # Extract limited rows based on result type
+                    if isinstance(cached_result, dict) and 'rows' in cached_result:
+                        cached_result = {
+                            'headers': cached_result.get('headers', {}),
+                            'rows': cached_result['rows'][:limit],
+                            'count': cached_result.get('count', len(cached_result.get('rows', [])))
+                        }
+                    elif isinstance(cached_result, pd.DataFrame):
+                        # Keep the full count but limit the rows
+                        original_count = len(cached_result)
+                        cached_result = cached_result.head(limit)
+                        # Add count attribute if possible
+                        if hasattr(cached_result, '_metadata'):
+                            cached_result._metadata['count'] = original_count
+
             if cached_result is not None:
                 # Validate that cached result has essential fields for term_info
                 if query_type == 'term_info':
                     is_valid = (cached_result and isinstance(cached_result, dict) and
                                 cached_result.get('Id') and cached_result.get('Name'))
 
-                    # Additional validation for query results
-
+                    # Additional validation for query results - only when preview=True
+                    preview = kwargs.get('preview', True)  # Default is True
+                    if is_valid and preview and 'Queries' in cached_result:
                         logger.debug(f"Validating {len(cached_result['Queries'])} queries for {term_id}")
                         for i, query in enumerate(cached_result['Queries']):
-                            count = query.get('count',
+                            count = query.get('count', -1)  # Default to -1 if missing
                             preview_results = query.get('preview_results')
                             headers = preview_results.get('headers', []) if isinstance(preview_results, dict) else []
 
                             logger.debug(f"Query {i}: count={count}, preview_results_type={type(preview_results)}, headers={headers}")
 
-                            # Check if query has
-
+                            # Check if query has error count (-1) which indicates failed execution
+                            # Note: count of 0 is valid - it means "no matches found"
+                            if count < 0:
                                 is_valid = False
-                                logger.debug(f"Cached result has
+                                logger.debug(f"Cached result has error query count {count} for {term_id}")
                                 break
                             # Check if preview_results is missing or has empty headers when it should have data
                             if not isinstance(preview_results, dict) or not headers:
```
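Two ideas from this hunk are worth isolating. First, every parameter that changes the shape of the result (`preview`, `return_dataframe`) becomes part of the cache key. Second, a limited request no longer misses just because only the full result is cached: the wrapper slices the cached full result instead. A sketch of both (names illustrative):

```python
import pandas as pd

def compose_cache_key(query_type: str, term_id: str, **kwargs) -> str:
    # Parameters that change the result's shape become part of the key.
    key = term_id
    if query_type == 'term_info':
        key = f"{key}_preview_{kwargs.get('preview', True)}"
    elif query_type == 'instances':  # one of the DataFrame-capable types listed above
        key = f"{key}_df_{kwargs.get('return_dataframe', True)}"
    return key

def head_of_cached(cached, limit: int):
    # Serve a limited request by slicing the cached full result.
    if isinstance(cached, dict) and 'rows' in cached:
        return {**cached, 'rows': cached['rows'][:limit]}
    if isinstance(cached, pd.DataFrame):
        return cached.head(limit)
    return cached

print(compose_cache_key('term_info', 'FBbt_00003748', preview=False))
# FBbt_00003748_preview_False
```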
```diff
@@ -635,23 +707,90 @@
         result = func(*args, **kwargs)
 
         # Cache the result asynchronously to avoid blocking
-
+        # Handle DataFrame, dict, and other result types properly
+        result_is_valid = False
+        result_is_error = False  # Track if result is an error that should clear cache
+
+        if result is not None:
+            if hasattr(result, 'empty'):  # DataFrame
+                result_is_valid = not result.empty
+            elif isinstance(result, dict):
+                # For dict results, check if it's not an error result (count != -1)
+                # Error results should not be cached
+                if 'count' in result:
+                    count_value = result.get('count', -1)
+                    result_is_valid = count_value >= 0  # Don't cache errors (count=-1)
+                    result_is_error = count_value < 0  # Mark as error if count is negative
+                else:
+                    result_is_valid = bool(result)  # For dicts without count field
+            elif isinstance(result, (list, str)):
+                result_is_valid = len(result) > 0
+            else:
+                result_is_valid = True
+
+        # If result is an error, actively clear any existing cache entry
+        # This ensures that transient failures don't get stuck in cache
+        if result_is_error:
+            logger.warning(f"Query returned error result for {query_type}({term_id}), clearing cache entry")
+            try:
+                cache.clear_cache_entry(query_type, cache_term_id)
+            except Exception as e:
+                logger.debug(f"Failed to clear cache entry: {e}")
+
+        if result_is_valid:
             # Validate result before caching for term_info
             if query_type == 'term_info':
-
-
+                # Basic validation: must have Id and Name
+                is_complete = (result and isinstance(result, dict) and
+                               result.get('Id') and result.get('Name'))
+
+                # Additional validation when preview=True: check if queries have results
+                # We allow caching even if some queries failed (count=-1) as long as the core term_info is valid
+                # This is because some query functions may not be implemented yet or may legitimately fail
+                if is_complete:
+                    preview = kwargs.get('preview', True)
+                    if preview and 'Queries' in result and result['Queries']:
+                        # Count how many queries have valid results vs errors
+                        valid_queries = 0
+                        failed_queries = 0
+
+                        for query in result['Queries']:
+                            count = query.get('count', -1)
+                            preview_results = query.get('preview_results')
+
+                            # Count queries with valid results (count >= 0)
+                            if count >= 0 and isinstance(preview_results, dict):
+                                valid_queries += 1
+                            else:
+                                failed_queries += 1
+
+                        # Only reject if ALL queries failed - at least one must succeed
+                        if valid_queries == 0 and failed_queries > 0:
+                            is_complete = False
+                            logger.warning(f"Not caching result for {term_id}: all {failed_queries} queries failed")
+                        elif failed_queries > 0:
+                            logger.debug(f"Caching result for {term_id} with {valid_queries} valid queries ({failed_queries} failed)")
+
+                # Only cache if result is complete AND no limit was applied
+                if is_complete and should_cache:
                     try:
-                        cache.cache_result(query_type,
+                        cache.cache_result(query_type, cache_term_id, result, **kwargs)
                         logger.debug(f"Cached complete result for {term_id}")
                     except Exception as e:
                         logger.debug(f"Failed to cache result: {e}")
+                elif not should_cache:
+                    logger.debug(f"Not caching limited result for {term_id} (limit={limit})")
                 else:
                     logger.warning(f"Not caching incomplete result for {term_id}")
             else:
-
-
-
-
+                # Only cache if no limit was applied
+                if should_cache:
+                    try:
+                        cache.cache_result(query_type, cache_term_id, result, **kwargs)
+                    except Exception as e:
+                        logger.debug(f"Failed to cache result: {e}")
+                else:
+                    logger.debug(f"Not caching limited result for {term_id} (limit={limit})")
 
         return result
 
```
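Putting the pieces together, the decorator's observable contract after this release can be summarised with a usage sketch. The decorated function below is hypothetical (the real query functions live in vfbquery/vfb_queries.py); the keyword behaviour follows the wrapper code above:

```python
from vfbquery.solr_result_cache import with_solr_cache

@with_solr_cache('instances')
def get_instances(short_form: str, limit: int = -1, return_dataframe: bool = True):
    ...  # placeholder body; the real implementation queries the VFB backends

# get_instances('FBbt_00003748')                      # full result: cached
# get_instances('FBbt_00003748', limit=10)            # sliced from the cached full
#                                                     # result if present; never cached itself
# get_instances('FBbt_00003748', force_refresh=True)  # clears the entry, recomputes, re-caches
```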