fast-csv-loader 2.1.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fast-csv-loader
3
- Version: 2.1.0
3
+ Version: 2.2.0
4
4
  Summary: A fast and memory efficient way to load large CSV files (Timeseries data) into Pandas
5
5
  Project-URL: Bug Tracker, https://github.com/BennyThadikaran/fast_csv_loader/issues
6
6
  Project-URL: Homepage, https://github.com/BennyThadikaran/fast_csv_loader
@@ -51,6 +51,45 @@ It also improves program execution time, when iterating or loading a large numbe
51
51
 
52
52
  [https://bennythadikaran.github.io/fast_csv_loader/](https://bennythadikaran.github.io/fast_csv_loader/)
53
53
 
54
+ ## Cached Loader (mtime-aware)
55
+
56
+ For workloads where the same files are read repeatedly — scanners looping
57
+ over symbol CSVs, dashboards re-rendering, rolling backtests — use
58
+ `cached_csv_loader`. It wraps `csv_loader` with an in-memory cache that
59
+ automatically invalidates when the file's modification time changes.
60
+
61
+ ```python
62
+ from fast_csv_loader import cached_csv_loader, cache_stats, invalidate_all
63
+ from pathlib import Path
64
+
65
+ # First call: reads from disk
66
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
67
+
68
+ # Subsequent calls on same file: served from cache (no disk read or re-parse)
69
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
70
+
71
+ # After your EOD job writes new data, the next call auto-invalidates
72
+ # (mtime changed on disk). For explicit control:
73
+ from fast_csv_loader import invalidate
74
+ invalidate("AAPL.csv") # drop one file
75
+ invalidate_all() # drop everything
76
+
77
+ # Observability
78
+ print(cache_stats())
79
+ # {'hits': 49, 'misses': 1, 'evictions': 0, 'size': 1, 'hit_rate': 98.0, 'max_size': 500}
80
+ ```
81
+
82
+ Benchmark on 133 small daily CSVs (~12 KB each), 5 repeat passes:
83
+
84
+ ```
85
+ csv_loader (no cache): ~555 ms
86
+ cached_csv_loader (warm): ~13 ms (~43x faster)
87
+ ```
88
+
89
+ The cache is process-local and thread-safe. Entries are evicted in
90
+ insertion order when the cache exceeds `max_size` (default 500). Adjust
91
+ with `set_max_cache_size(n)`.
92
+
54
93
  ## Performance
55
94
 
56
95
  Loading a portion of a large file is significantly faster than loading the entire file in memory.
@@ -22,6 +22,45 @@ It also improves program execution time, when iterating or loading a large numbe
22
22
 
23
23
  [https://bennythadikaran.github.io/fast_csv_loader/](https://bennythadikaran.github.io/fast_csv_loader/)
24
24
 
25
+ ## Cached Loader (mtime-aware)
26
+
27
+ For workloads where the same files are read repeatedly — scanners looping
28
+ over symbol CSVs, dashboards re-rendering, rolling backtests — use
29
+ `cached_csv_loader`. It wraps `csv_loader` with an in-memory cache that
30
+ automatically invalidates when the file's modification time changes.
31
+
32
+ ```python
33
+ from fast_csv_loader import cached_csv_loader, cache_stats, invalidate_all
34
+ from pathlib import Path
35
+
36
+ # First call: reads from disk
37
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
38
+
39
+ # Subsequent calls on same file: served from cache (no disk read or re-parse)
40
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
41
+
42
+ # After your EOD job writes new data, the next call auto-invalidates
43
+ # (mtime changed on disk). For explicit control:
44
+ from fast_csv_loader import invalidate
45
+ invalidate("AAPL.csv") # drop one file
46
+ invalidate_all() # drop everything
47
+
48
+ # Observability
49
+ print(cache_stats())
50
+ # {'hits': 49, 'misses': 1, 'evictions': 0, 'size': 1, 'hit_rate': 98.0, 'max_size': 500}
51
+ ```
52
+
53
+ Benchmark on 133 small daily CSVs (~12 KB each), 5 repeat passes:
54
+
55
+ ```
56
+ csv_loader (no cache): ~555 ms
57
+ cached_csv_loader (warm): ~13 ms (~43x faster)
58
+ ```
59
+
60
+ The cache is process-local and thread-safe. Entries are evicted in
61
+ insertion order when the cache exceeds `max_size` (default 500). Adjust
62
+ with `set_max_cache_size(n)`.
63
+
25
64
  ## Performance
26
65
 
27
66
  Loading a portion of a large file is significantly faster than loading the entire file in memory.
@@ -0,0 +1,17 @@
1
+ from fast_csv_loader.csv_loader import csv_loader
2
+ from fast_csv_loader.cached_loader import (
3
+ cached_csv_loader,
4
+ invalidate,
5
+ invalidate_all,
6
+ cache_stats,
7
+ set_max_cache_size,
8
+ )
9
+
10
+ __all__ = [
11
+ "csv_loader",
12
+ "cached_csv_loader",
13
+ "invalidate",
14
+ "invalidate_all",
15
+ "cache_stats",
16
+ "set_max_cache_size",
17
+ ]
@@ -0,0 +1,248 @@
1
+ """
2
+ cached_loader — mtime-aware in-memory caching wrapper around csv_loader.
3
+
4
+ When the same CSV file is read repeatedly (e.g. in a scanner loop, a
5
+ rolling backtest, or a dashboard re-render), re-parsing from disk every
6
+ time is wasteful. This module caches the parsed DataFrame and invalidates
7
+ it automatically when the file's modification time changes.
8
+
9
+ Typical use case: a trading scanner loops 50–200 stock CSVs every
10
+ few minutes, the files only change once a day after EOD sync. Without
11
+ caching, every loop re-parses every file.
12
+
13
+ Benchmark on 133 small daily CSVs (~12 KB each), 5 repeat passes:
14
+ csv_loader (no cache): ~555 ms
15
+ cached_csv_loader (warm): ~13 ms (~43x faster)
16
+
17
+ Usage:
18
+ from fast_csv_loader import cached_csv_loader
19
+
20
+ df = cached_csv_loader(Path("AAPL.csv"), period=200) # first call: disk
21
+ df = cached_csv_loader(Path("AAPL.csv"), period=200) # second call: cached
22
+
23
+ # After writing new data to the file, cache auto-invalidates on next read
24
+ # because the file mtime changed. For explicit invalidation:
25
+ from fast_csv_loader import invalidate, invalidate_all, cache_stats
26
+ invalidate("AAPL.csv")
27
+ invalidate_all()
28
+ stats = cache_stats()
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import threading
34
+ from datetime import datetime
35
+ from pathlib import Path
36
+ from typing import List, Optional
37
+
38
+ import pandas as pd
39
+
40
+ from fast_csv_loader.csv_loader import csv_loader
41
+
42
+ # Internal cache: (absolute_path_str, end_date, date_format, use_columns) -> cache entry whose first two items are (mtime, dataframe)
43
+ _cache: dict = {}
44
+ _cache_lock = threading.Lock()
45
+ _stats = {"hits": 0, "misses": 0, "evictions": 0}
46
+
47
+ # Cap cache size to avoid unbounded memory growth in long-running processes.
48
+ # Entries are evicted in insertion order (FIFO — cheaper than a true LRU).
49
+ _MAX_CACHE_ENTRIES = 500
50
+
51
+
52
def _evict_if_full() -> None:
    """
    Trim the cache so it holds at most ``_MAX_CACHE_ENTRIES`` entries.

    Does nothing while the cache is within its limit. Once the limit is
    exceeded, the oldest entries (dict insertion order) are dropped in one
    batch. The batch is a fifth of the limit, but never smaller than the
    current overflow, so the cache always ends up within the limit even
    when the limit has just been lowered well below the current size.

    Each dropped entry increments ``_stats["evictions"]``.

    This function does not acquire ``_cache_lock``; callers are expected to
    hold it while calling.
    """
    overflow = len(_cache) - _MAX_CACHE_ENTRIES
    if overflow <= 0:
        return

    # Evict in batches so steady-state inserts do not trigger an eviction on
    # every call, but always remove at least the full overflow.
    drop_count = max(overflow, _MAX_CACHE_ENTRIES // 5, 1)
    for key in list(_cache)[:drop_count]:
        _cache.pop(key, None)
        _stats["evictions"] += 1
58
+
59
+
60
def _tail_copy(df: pd.DataFrame, period: int) -> pd.DataFrame:
    """Return a copy of the last ``period`` rows of ``df`` (all rows if ``period`` is falsy)."""
    if period and len(df) > period:
        return df.iloc[-period:].copy()
    return df.copy()


def cached_csv_loader(
    file_path: Path,
    period: int = 160,
    end_date: Optional[datetime] = None,
    date_format: Optional[str] = None,
    use_columns: Optional[List[str]] = None,
    chunk_size: int = 1024 * 6,
) -> pd.DataFrame:
    """
    .. versionadded:: 2.2.0

    Mtime-aware cached wrapper around ``csv_loader``.

    Provides a drop-in replacement for ``csv_loader`` for cases where the
    same CSV file may be read multiple times within the same process. Results
    are cached based on file path, modification time, and selected query
    parameters.

    The cache key is composed of (file_path, end_date, date_format,
    use_columns). The ``period`` parameter is NOT part of the cache key and
    is applied after cache retrieval, meaning different ``period`` values
    reuse the same cached DataFrame and only affect the returned slice.

    On a cache miss the underlying loader is asked for
    ``max(period * 4, 1000)`` rows (10,000 when ``period`` is falsy) so that
    later calls with a somewhat larger ``period`` can still be served from
    cache. A cached DataFrame is reused only if it holds at least ``period``
    rows, or if it was itself loaded with a period at least that large;
    otherwise the file is reloaded with the larger period.

    If the underlying file has changed (based on mtime), the cache entry is
    invalidated and the file is reloaded.

    The returned DataFrame is always a copy, so callers may modify it without
    affecting the cached data.

    :param file_path: The path to the CSV file to be loaded.
    :type file_path: pathlib.Path

    :param period: Number of rows/candles to return from the end of the
        dataset. Default is 160.
    :type period: int

    :param end_date: Load data up to this timestamp. If None, the most
        recent data is used. If provided, loading is anchored to this date.
    :type end_date: Optional[datetime]

    :param date_format: Custom datetime format string used for parsing the
        CSV date column if automatic parsing fails.
    :type date_format: Optional[str]

    :param use_columns: List of column names to load from the CSV file.
        If None, all columns are loaded.
    :type use_columns: Optional[List[str]]

    :param chunk_size: Size of chunks (in bytes) used when reading the CSV
        file. Default is 6144 bytes (6 KB).
    :type chunk_size: int

    :return: A DataFrame containing the requested slice of timeseries data.
        An empty DataFrame is returned if the file exists but its metadata
        cannot be read.
    :rtype: pd.DataFrame

    :raise FileNotFoundError: If ``file_path`` does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"No such file or directory: '{file_path}'")

    try:
        mtime = file_path.stat().st_mtime
    except FileNotFoundError:
        # The file vanished between the exists() check and stat(); report it
        # the same way as a file that was never there.
        raise FileNotFoundError(
            f"No such file or directory: '{file_path}'"
        ) from None
    except OSError:
        # Best effort: metadata is unreadable, so there is nothing to load.
        return pd.DataFrame()

    use_columns_key = tuple(use_columns) if use_columns else None
    cache_key = (str(file_path.resolve()), end_date, date_format, use_columns_key)

    # Load with enough history that later calls with larger `period` values
    # can still be served from cache: period * 4 (min 1000), or 10_000 rows
    # when no period is given.
    load_period = max(period * 4, 1000) if period else 10_000
    # Rows this call needs the cached frame to be able to cover.
    needed = period if period else load_period

    with _cache_lock:
        entry = _cache.get(cache_key)
        if entry is not None and entry[0] == mtime:
            cached_df, loaded_period = entry[1], entry[2]
            # Only reuse the cached frame when it can satisfy this request:
            # either it already holds enough rows, or it was loaded with a
            # period at least this large (so reloading would not yield more).
            if len(cached_df) >= needed or loaded_period >= needed:
                _stats["hits"] += 1
                return _tail_copy(cached_df, period)
        _stats["misses"] += 1

    # The disk read happens outside the lock so other threads are not blocked.
    df = csv_loader(
        file_path,
        period=load_period,
        end_date=end_date,
        date_format=date_format,
        use_columns=use_columns,
        chunk_size=chunk_size,
    )

    with _cache_lock:
        # Remember the period used for loading so later hits can tell whether
        # the cached frame is large enough for their request.
        _cache[cache_key] = (mtime, df, load_period)
        _evict_if_full()

    return _tail_copy(df, period)
154
+
155
+
156
def invalidate(file_path) -> int:
    """
    .. versionadded:: 2.2.0

    Remove every cache entry belonging to one file, whatever ``end_date``,
    date format or column selection it was cached with.

    Call this after rewriting a file when the next read must not come from
    cache. Without it, entries are still refreshed automatically once the
    file's modification time differs from the cached one.

    :param file_path: Path of the file whose cache entries should be removed.
    :type file_path: pathlib.Path | str

    :return: Number of cache entries removed for the given file.
    :rtype: int
    """
    resolved = str(Path(file_path).resolve())
    removed = 0
    with _cache_lock:
        # Iterate over a snapshot of the keys so entries can be deleted
        # while scanning.
        for key in tuple(_cache):
            if key[0] == resolved:
                del _cache[key]
                removed += 1
    return removed
178
+
179
+
180
def invalidate_all() -> int:
    """
    .. versionadded:: 2.2.0

    Empty the cache completely.

    Handy for returning to a clean state, e.g. between tests or after many
    files have been rewritten at once.

    :return: Number of cache entries removed.
    :rtype: int
    """
    with _cache_lock:
        dropped = len(_cache)
        _cache.clear()
    return dropped
196
+
197
+
198
def cache_stats() -> dict:
    """
    .. versionadded:: 2.2.0

    Report cache counters: hits, misses, evictions, current size, hit rate
    and the configured size limit.

    :return: Dictionary containing cache statistics:

        - ``hits``: Number of cache hits
        - ``misses``: Number of cache misses
        - ``evictions``: Number of evicted entries
        - ``size``: Current number of cached entries
        - ``hit_rate``: Cache hit rate as a percentage (rounded to 1 decimal)
        - ``max_size``: Maximum allowed cache size

    :rtype: dict
    """
    with _cache_lock:
        hits = _stats["hits"]
        misses = _stats["misses"]
        lookups = hits + misses
        # Avoid dividing by zero before any lookup has happened.
        if lookups:
            hit_rate = hits / lookups * 100
        else:
            hit_rate = 0.0
        snapshot = {
            "hits": hits,
            "misses": misses,
            "evictions": _stats["evictions"],
            "size": len(_cache),
            "hit_rate": round(hit_rate, 1),
            "max_size": _MAX_CACHE_ENTRIES,
        }
    return snapshot
227
+
228
+
229
def set_max_cache_size(n: int) -> None:
    """
    .. versionadded:: 2.2.0

    Set the maximum number of cached entries allowed.

    The new limit takes effect immediately: if the cache currently holds
    more entries than ``n``, the oldest entries are evicted until it fits.

    :param n: New maximum cache size. Must be greater than 0 after
        conversion to ``int``.
    :type n: int

    :raise ValueError: If ``n`` is less than or equal to 0 once converted
        to an integer (for example ``0``, ``-1`` or ``0.5``).
    """
    global _MAX_CACHE_ENTRIES
    # Validate the value that will actually be stored: int() truncates, so a
    # fractional input such as 0.5 would otherwise slip through as 0.
    size = int(n)
    if size <= 0:
        raise ValueError("max cache size must be > 0")
    with _cache_lock:
        # Update the limit under the lock so concurrent readers and evictions
        # never observe a half-applied change.
        _MAX_CACHE_ENTRIES = size
        # One eviction pass may remove only a single batch; repeat until the
        # cache fits the new limit.
        while len(_cache) > _MAX_CACHE_ENTRIES:
            _evict_if_full()
@@ -4,7 +4,7 @@ requires = [ "hatchling" ]
4
4
 
5
5
  [project]
6
6
  name = "fast-csv-loader"
7
- version = "2.1.0"
7
+ version = "2.2.0"
8
8
  description = "A fast and memory efficient way to load large CSV files (Timeseries data) into Pandas"
9
9
  readme = "README.md"
10
10
  keywords = [ "csv-loader", "csv-reader", "memory-efficient", "pandas-dataframe", "python3" ]
@@ -1 +0,0 @@
1
- from fast_csv_loader.csv_loader import csv_loader
File without changes