fast-csv-loader 2.1.0__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fast-csv-loader
3
- Version: 2.1.0
3
+ Version: 2.2.1
4
4
  Summary: A fast and memory efficient way to load large CSV files (Timeseries data) into Pandas
5
5
  Project-URL: Bug Tracker, https://github.com/BennyThadikaran/fast_csv_loader/issues
6
6
  Project-URL: Homepage, https://github.com/BennyThadikaran/fast_csv_loader
@@ -43,6 +43,11 @@ It also improves program execution time, when iterating or loading a large numbe
43
43
 
44
44
  **Supports Python >= 3.8**
45
45
 
46
+ > **Note (v2.2.0):** This release introduces `cached_csv_loader`, an optional drop-in caching layer for `csv_loader` that significantly improves performance for repeated file reads. Existing behavior remains unchanged. Users are encouraged to review the updated documentation for details on cache behavior, invalidation, and configuration options.
47
+ >
48
+ > This feature was contributed by GitHub user **sai2311-eng**.
49
+
50
+
46
51
  ## Install
47
52
 
48
53
  `pip install fast-csv-loader`
@@ -51,6 +56,45 @@ It also improves program execution time, when iterating or loading a large numbe
51
56
 
52
57
  [https://bennythadikaran.github.io/fast_csv_loader/](https://bennythadikaran.github.io/fast_csv_loader/)
53
58
 
59
+ ## Cached Loader (mtime-aware)
60
+
61
+ For workloads where the same files are read repeatedly — scanners looping
62
+ over symbol CSVs, dashboards re-rendering, rolling backtests — use
63
+ `cached_csv_loader`. It wraps `csv_loader` with an in-memory cache that
64
+ automatically invalidates when the file's modification time changes.
65
+
66
+ ```python
67
+ from fast_csv_loader import cached_csv_loader, cache_stats, invalidate_all
68
+ from pathlib import Path
69
+
70
+ # First call: reads from disk
71
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
72
+
73
+ # Subsequent calls on the same file: served from the in-memory cache (no disk read; a copy is returned)
74
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
75
+
76
+ # After your EOD job writes new data, the next call auto-invalidates
77
+ # (mtime changed on disk). For explicit control:
78
+ from fast_csv_loader import invalidate
79
+ invalidate("AAPL.csv") # drop one file
80
+ invalidate_all() # drop everything
81
+
82
+ # Observability
83
+ print(cache_stats())
84
+ # {'hits': 49, 'misses': 1, 'evictions': 0, 'size': 1, 'hit_rate': 98.0, 'max_size': 500}
85
+ ```
86
+
87
+ Benchmark on 133 small daily CSVs (~12 KB each), 5 repeat passes:
88
+
89
+ ```
90
+ csv_loader (no cache): ~555 ms
91
+ cached_csv_loader (warm): ~13 ms (~43x faster)
92
+ ```
93
+
94
+ The cache is process-local and thread-safe. Entries are evicted in
95
+ insertion order when the cache exceeds `max_size` (default 500). Adjust
96
+ with `set_max_cache_size(n)`.
97
+
54
98
  ## Performance
55
99
 
56
100
  Loading a portion of a large file is significantly faster than loading the entire file in memory.
@@ -14,6 +14,11 @@ It also improves program execution time, when iterating or loading a large numbe
14
14
 
15
15
  **Supports Python >= 3.8**
16
16
 
17
+ > **Note (v2.2.0):** This release introduces `cached_csv_loader`, an optional drop-in caching layer for `csv_loader` that significantly improves performance for repeated file reads. Existing behavior remains unchanged. Users are encouraged to review the updated documentation for details on cache behavior, invalidation, and configuration options.
18
+ >
19
+ > This feature was contributed by GitHub user **sai2311-eng**.
20
+
21
+
17
22
  ## Install
18
23
 
19
24
  `pip install fast-csv-loader`
@@ -22,6 +27,45 @@ It also improves program execution time, when iterating or loading a large numbe
22
27
 
23
28
  [https://bennythadikaran.github.io/fast_csv_loader/](https://bennythadikaran.github.io/fast_csv_loader/)
24
29
 
30
+ ## Cached Loader (mtime-aware)
31
+
32
+ For workloads where the same files are read repeatedly — scanners looping
33
+ over symbol CSVs, dashboards re-rendering, rolling backtests — use
34
+ `cached_csv_loader`. It wraps `csv_loader` with an in-memory cache that
35
+ automatically invalidates when the file's modification time changes.
36
+
37
+ ```python
38
+ from fast_csv_loader import cached_csv_loader, cache_stats, invalidate_all
39
+ from pathlib import Path
40
+
41
+ # First call: reads from disk
42
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
43
+
44
+ # Subsequent calls on the same file: served from the in-memory cache (no disk read; a copy is returned)
45
+ df = cached_csv_loader(Path("AAPL.csv"), period=200)
46
+
47
+ # After your EOD job writes new data, the next call auto-invalidates
48
+ # (mtime changed on disk). For explicit control:
49
+ from fast_csv_loader import invalidate
50
+ invalidate("AAPL.csv") # drop one file
51
+ invalidate_all() # drop everything
52
+
53
+ # Observability
54
+ print(cache_stats())
55
+ # {'hits': 49, 'misses': 1, 'evictions': 0, 'size': 1, 'hit_rate': 98.0, 'max_size': 500}
56
+ ```
57
+
58
+ Benchmark on 133 small daily CSVs (~12 KB each), 5 repeat passes:
59
+
60
+ ```
61
+ csv_loader (no cache): ~555 ms
62
+ cached_csv_loader (warm): ~13 ms (~43x faster)
63
+ ```
64
+
65
+ The cache is process-local and thread-safe. Entries are evicted in
66
+ insertion order when the cache exceeds `max_size` (default 500). Adjust
67
+ with `set_max_cache_size(n)`.
68
+
25
69
  ## Performance
26
70
 
27
71
  Loading a portion of a large file is significantly faster than loading the entire file in memory.
@@ -0,0 +1,17 @@
1
+ from fast_csv_loader.csv_loader import csv_loader
2
+ from fast_csv_loader.cached_loader import (
3
+ cached_csv_loader,
4
+ invalidate,
5
+ invalidate_all,
6
+ cache_stats,
7
+ set_max_cache_size,
8
+ )
9
+
10
+ __all__ = [
11
+ "csv_loader",
12
+ "cached_csv_loader",
13
+ "invalidate",
14
+ "invalidate_all",
15
+ "cache_stats",
16
+ "set_max_cache_size",
17
+ ]
@@ -0,0 +1,249 @@
1
+ """
2
+ cached_loader — mtime-aware in-memory caching wrapper around csv_loader.
3
+
4
+ When the same CSV file is read repeatedly (e.g. in a scanner loop, a
5
+ rolling backtest, or a dashboard re-render), re-parsing from disk every
6
+ time is wasteful. This module caches the parsed DataFrame and invalidates
7
+ it automatically when the file's modification time changes.
8
+
9
+ Typical use case: a trading scanner loops 50–200 stock CSVs every
10
+ few minutes, but the files only change once a day after EOD sync. Without
11
+ caching, every loop re-parses every file.
12
+
13
+ Benchmark on 133 small daily CSVs (~12 KB each), 5 repeat passes:
14
+ csv_loader (no cache): ~555 ms
15
+ cached_csv_loader (warm): ~13 ms (~43x faster)
16
+
17
+ Usage:
18
+ from fast_csv_loader import cached_csv_loader
19
+
20
+ df = cached_csv_loader(Path("AAPL.csv"), period=200) # first call: disk
21
+ df = cached_csv_loader(Path("AAPL.csv"), period=200) # second call: cached
22
+
23
+ # After writing new data to the file, cache auto-invalidates on next read
24
+ # because the file mtime changed. For explicit invalidation:
25
+ from fast_csv_loader import invalidate, invalidate_all, cache_stats
26
+ invalidate("AAPL.csv")
27
+ invalidate_all()
28
+ stats = cache_stats()
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import threading
34
+ from datetime import datetime
35
+ from pathlib import Path
36
+ from typing import Optional
37
+ from collections.abc import Sequence
38
+
39
+ import pandas as pd
40
+
41
+ from fast_csv_loader.csv_loader import csv_loader
42
+
43
# Internal cache. Keys are the tuples built by ``cached_csv_loader``:
# (resolved_path_str, end_date, date_format, use_columns_tuple). Values are
# tuples whose first two items are (mtime, dataframe). Dicts preserve
# insertion order, which the eviction policy below relies on.
_cache: dict = {}
_cache_lock = threading.Lock()
_stats = {"hits": 0, "misses": 0, "evictions": 0}

# Cap cache size to avoid unbounded memory growth in long-running processes.
# Entries are evicted oldest-inserted first (FIFO); this is cheaper than
# tracking recency of use as a true LRU would.
_MAX_CACHE_ENTRIES = 500


def _evict_if_full() -> None:
    """
    Trim the cache when it holds more than ``_MAX_CACHE_ENTRIES`` entries.

    Entries are dropped oldest-inserted first. At least a batch of 20% of the
    limit (minimum one entry) is removed so that a cache sitting at its limit
    does not evict on every single insert. When the cache is over the limit
    by more than one batch (for example right after the limit was lowered),
    the whole excess is removed so the cache never stays above the limit.

    Callers in this module invoke it while holding ``_cache_lock``; the
    function itself does no locking.
    """
    overflow = len(_cache) - _MAX_CACHE_ENTRIES
    if overflow <= 0:
        return

    batch = max(1, _MAX_CACHE_ENTRIES // 5)
    # ``batch`` is always smaller than the current size here, so the most
    # recently inserted entry is never dropped.
    drop_count = max(overflow, batch)

    # Snapshot the keys first: the dict must not change size while iterated.
    for key in list(_cache)[:drop_count]:
        _cache.pop(key, None)
        _stats["evictions"] += 1
59
+
60
+
61
def cached_csv_loader(
    file_path: Path,
    period: int = 160,
    end_date: Optional[datetime] = None,
    date_format: Optional[str] = None,
    use_columns: Optional[Sequence[str]] = None,
    chunk_size: int = 1024 * 6,
) -> pd.DataFrame:
    """
    .. versionadded:: 2.2.0

    Mtime-aware cached wrapper around ``csv_loader``.

    Provides a drop-in replacement for ``csv_loader`` for cases where the
    same CSV file may be read multiple times within the same process.

    The cache key is composed of (resolved file path, end_date, date_format,
    use_columns). ``period`` and ``chunk_size`` are NOT part of the key. On a
    miss the file is loaded with a larger period than requested
    (``max(period * 4, 1000)``, or 10,000 when ``period`` is falsy) so that
    later calls with a different ``period`` can be answered by slicing the
    cached DataFrame.

    A cached entry is reused only when all of the following hold:

    - the file's modification time is unchanged, and
    - the cached DataFrame has at least ``period`` rows, or it has fewer rows
      than were requested from ``csv_loader`` when it was loaded (taken to
      mean the file had no more rows to give).

    Otherwise the file is reloaded with the larger period and the entry is
    replaced. A copy is always returned, so callers may mutate the result
    without affecting the cache.

    :param file_path: The path to the CSV file to be loaded.
    :type file_path: pathlib.Path

    :param period: Number of rows/candles to return from the end of the
        dataset. Default is 160. If falsy, the whole cached DataFrame is
        returned.
    :type period: int

    :param end_date: Load data up to this timestamp. If None, the most
        recent data is used. If provided, loading is anchored to this date.
    :type end_date: Optional[datetime]

    :param date_format: Custom datetime format string used for parsing the
        CSV date column if automatic parsing fails.
    :type date_format: Optional[str]

    :param use_columns: Default None. A sequence (e.g., list or tuple) of
        column names to load from the CSV file. If None, all columns are
        loaded.
    :type use_columns: Optional[Sequence[str]]

    :param chunk_size: Size of chunks (in bytes) used when reading the CSV
        file. Default is 6144 bytes (6 KB).
    :type chunk_size: int

    :return: A DataFrame containing the requested slice of timeseries data.
        An empty DataFrame is returned if the file exists but its metadata
        cannot be read.
    :rtype: pd.DataFrame

    :raise FileNotFoundError: If ``file_path`` does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"No such file or directory: '{file_path}'")

    try:
        mtime = file_path.stat().st_mtime
    except OSError:
        # Best effort: the file became unreadable between the existence
        # check and the stat call.
        return pd.DataFrame()

    use_columns_key = tuple(use_columns) if use_columns else None
    cache_key = (str(file_path.resolve()), end_date, date_format, use_columns_key)

    # Rows the cached frame must be able to supply to answer this call.
    needed = period if period else 10_000

    df = None
    with _cache_lock:
        entry = _cache.get(cache_key)
        if entry is not None and entry[0] == mtime:
            cached = entry[1]
            # Entries written by this function carry the period that was
            # requested from csv_loader; tolerate entries without it.
            loaded_period = entry[2] if len(entry) > 2 else len(cached)
            # Fewer rows than requested from the loader means the frame
            # already holds everything available, so it cannot be truncated.
            file_exhausted = len(cached) < loaded_period
            if file_exhausted or len(cached) >= needed:
                df = cached
        if df is None:
            _stats["misses"] += 1
        else:
            _stats["hits"] += 1

    if df is None:
        # Cache miss — load with enough history that later calls with larger
        # `period` values can still be served from cache.
        load_period = max(period * 4, 1000) if period else 10_000
        df = csv_loader(
            file_path,
            period=load_period,
            end_date=end_date,
            date_format=date_format,
            use_columns=use_columns,
            chunk_size=chunk_size,
        )

        with _cache_lock:
            # Remove any stale entry first so the refreshed one moves to the
            # newest position and is not the next candidate for eviction.
            _cache.pop(cache_key, None)
            _cache[cache_key] = (mtime, df, load_period)
            _evict_if_full()

    # Cached frames are never mutated in place, so copying outside the lock
    # is safe.
    if period and len(df) > period:
        return df.iloc[-period:].copy()
    return df.copy()
155
+
156
+
157
def invalidate(file_path) -> int:
    """
    .. versionadded:: 2.2.0

    Remove every cache entry that belongs to one file, whatever
    ``end_date`` / ``date_format`` / ``use_columns`` it was cached under.

    Entries are normally refreshed automatically when the file's
    modification time changes; call this to force the next read of the file
    to go to disk.

    :param file_path: Path of the file whose cache entries should be removed.
        It is resolved to an absolute path before matching.
    :type file_path: pathlib.Path | str

    :return: Number of cache entries removed for the given file.
    :rtype: int
    """
    resolved = str(Path(file_path).resolve())
    removed = 0
    with _cache_lock:
        # Iterate over a snapshot because entries are deleted during the loop.
        for cache_key in tuple(_cache):
            if cache_key[0] == resolved:
                del _cache[cache_key]
                removed += 1
    return removed
179
+
180
+
181
def invalidate_all() -> int:
    """
    .. versionadded:: 2.2.0

    Empty the cache completely.

    Handy for resetting state between tests or after a bulk update of the
    underlying files. Hit/miss/eviction counters are left untouched.

    :return: Number of cache entries removed.
    :rtype: int
    """
    with _cache_lock:
        dropped = len(_cache)
        _cache.clear()
    return dropped
197
+
198
+
199
def cache_stats() -> dict:
    """
    .. versionadded:: 2.2.0

    Report a snapshot of the cache counters.

    :return: Dictionary with the following keys:

        - ``hits``: Number of cache hits
        - ``misses``: Number of cache misses
        - ``evictions``: Number of evicted entries
        - ``size``: Current number of cached entries
        - ``hit_rate``: Hits as a percentage of hits + misses, rounded to
          1 decimal (``0.0`` when nothing has been looked up yet)
        - ``max_size``: Maximum allowed cache size

    :rtype: dict
    """
    # Read everything under the lock so the numbers are mutually consistent.
    with _cache_lock:
        hits = _stats["hits"]
        misses = _stats["misses"]
        evictions = _stats["evictions"]
        size = len(_cache)
        max_size = _MAX_CACHE_ENTRIES

    lookups = hits + misses
    if lookups:
        hit_rate = hits / lookups * 100
    else:
        hit_rate = 0.0

    return {
        "hits": hits,
        "misses": misses,
        "evictions": evictions,
        "size": size,
        "hit_rate": round(hit_rate, 1),
        "max_size": max_size,
    }
228
+
229
+
230
def set_max_cache_size(n: int) -> None:
    """
    .. versionadded:: 2.2.0

    Set the maximum number of cached entries allowed.

    The new limit takes effect immediately: if the cache currently holds
    more entries than ``n``, the oldest entries are evicted until it fits.

    :param n: New maximum cache size. Converted with ``int()``; the
        converted value must be greater than 0.
    :type n: int

    :raise ValueError: If ``int(n)`` is less than or equal to 0.
    """
    global _MAX_CACHE_ENTRIES

    # Validate the value that is actually stored: checking ``n`` itself would
    # let e.g. 0.5 through and then store a limit of 0.
    limit = int(n)
    if limit <= 0:
        raise ValueError("max cache size must be > 0")

    with _cache_lock:
        # Update under the lock so loaders never evict against a limit that
        # is changing underneath them.
        _MAX_CACHE_ENTRIES = limit
        # One eviction pass may remove only a fixed batch; repeat until the
        # cache fits the new limit.
        while len(_cache) > _MAX_CACHE_ENTRIES:
            _evict_if_full()
@@ -2,8 +2,8 @@ import io
2
2
  import os
3
3
  from datetime import datetime
4
4
  from pathlib import Path
5
- from typing import Optional, List
6
-
5
+ from typing import Optional
6
+ from collections.abc import Sequence
7
7
  import pandas as pd
8
8
 
9
9
 
@@ -12,7 +12,7 @@ def csv_loader(
12
12
  period: int = 160,
13
13
  end_date: Optional[datetime] = None,
14
14
  date_format: Optional[str] = None,
15
- use_columns: Optional[List[str]] = None,
15
+ use_columns: Optional[Sequence[str]] = None,
16
16
  chunk_size: int = 1024 * 6,
17
17
  ) -> pd.DataFrame:
18
18
  """
@@ -35,8 +35,9 @@ def csv_loader(
35
35
  :param date_format: Custom date format in case pandas is unable to parse the date column.
36
36
  :type date_format: Optional[str]
37
37
 
38
- :param use_columns: Default None. List of column names to load from the CSV file. If None, all columns are loaded.
39
- :type use_columns: Optional[List[str]]
38
+ :param use_columns: Default None. A sequence (e.g., list or tuple) of column names to load
39
+ from the CSV file. If None, all columns are loaded.
40
+ :type use_columns: Optional[Sequence[str]]
40
41
 
41
42
  :param chunk_size: The size of data chunks loaded into memory.
42
43
  The default is 6144 bytes (6 KB).
@@ -4,7 +4,7 @@ requires = [ "hatchling" ]
4
4
 
5
5
  [project]
6
6
  name = "fast-csv-loader"
7
- version = "2.1.0"
7
+ version = "2.2.1"
8
8
  description = "A fast and memory efficient way to load large CSV files (Timeseries data) into Pandas"
9
9
  readme = "README.md"
10
10
  keywords = [ "csv-loader", "csv-reader", "memory-efficient", "pandas-dataframe", "python3" ]
@@ -33,5 +33,5 @@ urls."Bug Tracker" = "https://github.com/BennyThadikaran/fast_csv_loader/issues"
33
33
  urls.Homepage = "https://github.com/BennyThadikaran/fast_csv_loader"
34
34
 
35
35
  [tool.hatch]
36
- build.targets.sdist.exclude = [ "docs", "tests", ".github" ]
37
- build.targets.wheel.exclude = [ "docs", "tests", ".github" ]
36
+ build.targets.wheel.exclude = [ ".github", "docs", "tests" ]
37
+ build.targets.sdist.exclude = [ ".github", "docs", "tests" ]
@@ -0,0 +1,4 @@
1
+ pandas==2.0.3; python_version == "3.8"
2
+ pandas==2.2.2; python_version > "3.8" and python_version < "3.13"
3
+ pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
4
+ pandas==2.3.3; python_version >= "3.14"
@@ -1 +0,0 @@
1
- from fast_csv_loader.csv_loader import csv_loader
@@ -1 +0,0 @@
1
- pandas >= 2, <3
File without changes