cqla 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cqla-0.1.0/PKG-INFO +362 -0
- cqla-0.1.0/README.md +342 -0
- cqla-0.1.0/pyproject.toml +39 -0
- cqla-0.1.0/src/cqla/__init__.py +6 -0
- cqla-0.1.0/src/cqla/accessors/__init__.py +7 -0
- cqla-0.1.0/src/cqla/accessors/base.py +27 -0
- cqla-0.1.0/src/cqla/accessors/dt.py +193 -0
- cqla-0.1.0/src/cqla/accessors/list.py +38 -0
- cqla-0.1.0/src/cqla/accessors/set.py +87 -0
- cqla-0.1.0/src/cqla/accessors/string.py +62 -0
- cqla-0.1.0/src/cqla/accessors/struct.py +93 -0
- cqla-0.1.0/src/cqla/delegates.py +197 -0
- cqla-0.1.0/src/cqla/exprs/__init__.py +16 -0
- cqla-0.1.0/src/cqla/exprs/base.py +252 -0
- cqla-0.1.0/src/cqla/exprs/mixins.py +91 -0
- cqla-0.1.0/src/cqla/exprs/when.py +89 -0
- cqla-0.1.0/src/cqla/query.py +267 -0
cqla-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cqla
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LINQ-inspired query library for Python collections with Polars-style syntax
|
|
5
|
+
Keywords: query,linq,polars,filter,collections,dataframe,sql
|
|
6
|
+
Author: Ahmed Muhammad
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
+
Classifier: Typing :: Typed
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Project-URL: Homepage, https://github.com/ahmedmuhammad/cqla
|
|
17
|
+
Project-URL: Repository, https://github.com/ahmedmuhammad/cqla
|
|
18
|
+
Project-URL: Documentation, https://github.com/ahmedmuhammad/cqla#readme
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# Config Query Language API (cqla)
|
|
22
|
+
|
|
23
|
+
A query language for Python collections, inspired by LINQ with Polars-style syntax.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install cqla
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## The Problem
|
|
32
|
+
|
|
33
|
+
If you've worked with Pydantic models or dataclasses, you've probably written methods inside these classes, like this:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
@dataclass
|
|
37
|
+
class Config:
|
|
38
|
+
name: str
|
|
39
|
+
value: str
|
|
40
|
+
enabled: bool
|
|
41
|
+
priority: int
|
|
42
|
+
|
|
43
|
+
@dataclass
|
|
44
|
+
class ConfigStore:
|
|
45
|
+
configurations: list[Config]
|
|
46
|
+
|
|
47
|
+
def search_by_name(self, name: str) -> Config | None:
|
|
48
|
+
for cfg in self.configurations:
|
|
49
|
+
if cfg.name == name:
|
|
50
|
+
return cfg
|
|
51
|
+
|
|
52
|
+
def get_enabled(self) -> list[Config]:
|
|
53
|
+
return [cfg for cfg in self.configurations if cfg.enabled]
|
|
54
|
+
|
|
55
|
+
def get_high_priority(self, threshold: int) -> list[Config]:
|
|
56
|
+
return [cfg for cfg in self.configurations if cfg.priority > threshold]
|
|
57
|
+
|
|
58
|
+
def get_enabled_high_priority(self, threshold: int) -> list[Config]:
|
|
59
|
+
return [
|
|
60
|
+
cfg for cfg in self.configurations
|
|
61
|
+
if cfg.enabled and cfg.priority > threshold
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
# ...and so on, a new method for every query pattern
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
This gets tedious. Every new query requirement means another method. The logic is scattered, repetitive, and hard to compose.
|
|
68
|
+
|
|
69
|
+
With cqla, you don't need any of those methods:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import cqla as cq
|
|
73
|
+
|
|
74
|
+
configs = [...] # list of Config objects
|
|
75
|
+
|
|
76
|
+
# Search by name
|
|
77
|
+
cq.Query(configs).filter(cq.field("name") == "database_url").first()
|
|
78
|
+
|
|
79
|
+
# Get enabled configs
|
|
80
|
+
cq.Query(configs).filter(cq.field("enabled") == True).collect()
|
|
81
|
+
|
|
82
|
+
# High-priority enabled configs
|
|
83
|
+
(cq.Query(configs)
|
|
84
|
+
.filter((cq.field("enabled") == True) & (cq.field("priority") > 5))
|
|
85
|
+
.collect())
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Works With
|
|
89
|
+
|
|
90
|
+
cqla works with any Python objects:
|
|
91
|
+
|
|
92
|
+
- Plain dicts (JSON-like data)
|
|
93
|
+
- dataclasses
|
|
94
|
+
- Pydantic models
|
|
95
|
+
- msgspec Structs
|
|
96
|
+
- Any object with attributes
|
|
97
|
+
|
|
98
|
+
## Inspiration: LINQ and Polars had a Baby
|
|
99
|
+
|
|
100
|
+
cqla is inspired by [LINQ](https://learn.microsoft.com/en-us/dotnet/csharp/linq/) (Language Integrated Query) from C#/.NET. LINQ lets you query collections using a SQL-like, composable syntax:
|
|
101
|
+
|
|
102
|
+
```csharp
|
|
103
|
+
// C# LINQ
|
|
104
|
+
var results = configs
|
|
105
|
+
.Where(c => c.Enabled && c.Priority > 5)
|
|
106
|
+
.Select(c => new { c.Name, c.Value })
|
|
107
|
+
.OrderBy(c => c.Name);
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
cqla brings this same idea to Python, but with syntax borrowed from Polars:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
# cqla (Python)
|
|
114
|
+
results = (
|
|
115
|
+
cq.Query(configs)
|
|
116
|
+
.filter((cq.field("enabled") == True) & (cq.field("priority") > 5))
|
|
117
|
+
.select("name", "value")
|
|
118
|
+
.collect()
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Alternatives?
|
|
123
|
+
|
|
124
|
+
Libraries like [pydash](https://pydash.readthedocs.io/) and [toolz](https://toolz.readthedocs.io/) are excellent for functional programming patterns:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# pydash
|
|
128
|
+
import pydash as _
|
|
129
|
+
|
|
130
|
+
_.filter_(configs, lambda c: c.enabled and c.priority > 5)
|
|
131
|
+
_.map_(configs, lambda c: c.name.upper())
|
|
132
|
+
_.group_by(configs, "category")
|
|
133
|
+
|
|
134
|
+
# toolz
|
|
135
|
+
from toolz import filter, map, groupby
|
|
136
|
+
from toolz.curried import pipe
|
|
137
|
+
|
|
138
|
+
list(filter(lambda c: c.enabled, configs))
|
|
139
|
+
list(map(lambda c: c.name.upper(), configs))
|
|
140
|
+
groupby(lambda c: c.category, configs)
|
|
141
|
+
|
|
142
|
+
# composing operations in toolz
|
|
143
|
+
pipe(configs,
|
|
144
|
+
lambda x: filter(lambda c: c.enabled, x),
|
|
145
|
+
lambda x: map(lambda c: {"name": c.name.upper(), "priority": c.priority}, x),
|
|
146
|
+
list)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
These work, but if you think in SQL, they feel inside-out. The operations are functions you wrap around the data (in toolz the data even comes last), and composing multiple operations requires nesting or piping.
|
|
150
|
+
|
|
151
|
+
cqla reads like SQL, and when you need custom transformations, `.apply()` lets you drop into a lambda:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# cqla - reads top to bottom, left to right
|
|
155
|
+
(
|
|
156
|
+
cq.Query(configs)
|
|
157
|
+
.filter(cq.field("enabled") == True) # WHERE enabled = true
|
|
158
|
+
.filter(cq.field("priority") > 5) # AND priority > 5
|
|
159
|
+
.group_by("category") # GROUP BY category
|
|
160
|
+
.having(cq.field("priority").count() > 2) # HAVING COUNT(priority) > 2
|
|
161
|
+
.agg( # SELECT ...
|
|
162
|
+
count=cq.field("name").count(),
|
|
163
|
+
avg_priority=cq.field("priority").mean(),
|
|
164
|
+
)
|
|
165
|
+
.collect()
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
# apply() for custom transformations
|
|
169
|
+
(
|
|
170
|
+
cq.Query(configs)
|
|
171
|
+
.filter(cq.field("enabled") == True)
|
|
172
|
+
.select(
|
|
173
|
+
"name",
|
|
174
|
+
name_upper=cq.field("name").apply(str.upper),
|
|
175
|
+
slug=cq.field("name").apply(lambda s: s.lower().replace(" ", "-")),
|
|
176
|
+
)
|
|
177
|
+
.collect()
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Why Not Polars or Pandas?
|
|
182
|
+
|
|
183
|
+
Polars and Pandas are built for tabular data — rows and columns, where every row has the same schema. They're optimized for numerical computation on large datasets.
|
|
184
|
+
|
|
185
|
+
But configuration data, API responses, and domain objects are often semi-structured or nested:
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
configs = [
|
|
189
|
+
{
|
|
190
|
+
"name": "app",
|
|
191
|
+
"settings": {
|
|
192
|
+
"database": {"host": "localhost", "port": 5432},
|
|
193
|
+
"features": ["auth", "logging", "metrics"],
|
|
194
|
+
},
|
|
195
|
+
"metadata": {"version": 1, "tags": ["production"]},
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"name": "worker",
|
|
199
|
+
"settings": {
|
|
200
|
+
"queue": "redis://localhost",
|
|
201
|
+
# no "database" key here
|
|
202
|
+
},
|
|
203
|
+
"metadata": {"version": 2}, # no "tags" key
|
|
204
|
+
},
|
|
205
|
+
]
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Try loading this into Pandas:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
import pandas as pd
|
|
212
|
+
|
|
213
|
+
df = pd.DataFrame(configs)
|
|
214
|
+
print(df)
|
|
215
|
+
# name settings metadata
|
|
216
|
+
# 0 app {'database': {'host': 'localhost', 'port': 54... {'version': 1, 'tags': ['production']}
|
|
217
|
+
# 1 worker {'queue': 'redis://localhost'} {'version': 2}
|
|
218
|
+
|
|
219
|
+
# Want to filter by database host? Good luck.
|
|
220
|
+
df[df["settings"].apply(lambda s: s.get("database", {}).get("host")) == "localhost"]
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
The nested dicts stay as opaque objects. You're back to writing lambdas and `.apply()`.
|
|
224
|
+
|
|
225
|
+
Polars struggles with this data too:
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
import polars as pl
|
|
229
|
+
|
|
230
|
+
df = pl.DataFrame(configs)
|
|
231
|
+
# polars.exceptions.SchemaError:
|
|
232
|
+
# could not append value: {"database": {"host": "localhost" ...
|
|
233
|
+
# struct fields must have a consistent schema
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Polars won't even load it because the schemas don't match.
|
|
237
|
+
|
|
238
|
+
cqla handles this naturally:
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
import cqla as cq
|
|
242
|
+
|
|
243
|
+
# Filter by nested field
|
|
244
|
+
(
|
|
245
|
+
cq.Query(configs)
|
|
246
|
+
.filter(cq.field("settings.database.host") == "localhost")
|
|
247
|
+
.collect()
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
# Access nested fields in select
|
|
251
|
+
(
|
|
252
|
+
cq.Query(configs)
|
|
253
|
+
.select(
|
|
254
|
+
"name",
|
|
255
|
+
db_host=cq.field("settings.database.host"),
|
|
256
|
+
version=cq.field("metadata.version"),
|
|
257
|
+
)
|
|
258
|
+
.collect()
|
|
259
|
+
)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## Features
|
|
263
|
+
|
|
264
|
+
cqla supports the operations you'd expect from a query language:
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
import cqla as cq
|
|
268
|
+
|
|
269
|
+
data = [...] # list of dicts, dataclasses, Pydantic models, or any objects
|
|
270
|
+
|
|
271
|
+
# Filtering
|
|
272
|
+
cq.Query(data).filter(cq.field("age") > 30).collect()
|
|
273
|
+
cq.Query(data).filter((cq.field("age") > 30) & (cq.field("active") == True)).collect()
|
|
274
|
+
|
|
275
|
+
# Selecting fields
|
|
276
|
+
cq.Query(data).select("name", "email").collect()
|
|
277
|
+
cq.Query(data).select("name", uppercased=cq.field("name").str.to_uppercase()).collect()
|
|
278
|
+
|
|
279
|
+
# Adding computed columns
|
|
280
|
+
cq.Query(data).with_columns(
|
|
281
|
+
year=cq.field("created_at").dt.year(),
|
|
282
|
+
name_lower=cq.field("name").str.to_lowercase(),
|
|
283
|
+
).collect()
|
|
284
|
+
|
|
285
|
+
# Conditional expressions
|
|
286
|
+
cq.Query(data).select(
|
|
287
|
+
"name",
|
|
288
|
+
tier=cq.when(cq.field("score") >= 90).then("gold")
|
|
289
|
+
.when(cq.field("score") >= 70).then("silver")
|
|
290
|
+
.otherwise("bronze"),
|
|
291
|
+
).collect()
|
|
292
|
+
|
|
293
|
+
# Grouping and aggregation
|
|
294
|
+
cq.Query(data).group_by("department").agg(
|
|
295
|
+
count=cq.field("id").count(),
|
|
296
|
+
avg_salary=cq.field("salary").mean(),
|
|
297
|
+
).collect()
|
|
298
|
+
|
|
299
|
+
# Filtering groups (HAVING)
|
|
300
|
+
cq.Query(data).group_by("department").having(
|
|
301
|
+
cq.field("id").count() >= 5
|
|
302
|
+
).agg(
|
|
303
|
+
count=cq.field("id").count(),
|
|
304
|
+
).collect()
|
|
305
|
+
|
|
306
|
+
# Window functions
|
|
307
|
+
cq.Query(data).with_columns(
|
|
308
|
+
dept_avg=cq.field("salary").mean().over("department"),
|
|
309
|
+
).collect()
|
|
310
|
+
|
|
311
|
+
# Explode: expand list field into multiple rows
|
|
312
|
+
cq.Query(data).explode("tags").collect()
|
|
313
|
+
# [{"name": "alice", "tags": ["a", "b"]}] -> [{"name": "alice", "tags": "a"}, {"name": "alice", "tags": "b"}]
|
|
314
|
+
|
|
315
|
+
# Accessors for strings, lists, sets, datetimes
|
|
316
|
+
cq.field("name").str.contains("smith", literal=True)
|
|
317
|
+
cq.field("tags").list.len()
|
|
318
|
+
cq.field("categories").set.contains("electronics")
|
|
319
|
+
cq.field("created_at").dt.year()
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
## Scalability
|
|
323
|
+
|
|
324
|
+
cqla is built on generators. Operations like `filter`, `select`, and `with_columns` are lazy: they don't materialize the dataset until you iterate over the query or call `.collect()`. This means you can process large datasets without loading everything into memory:
|
|
325
|
+
|
|
326
|
+
```python
|
|
327
|
+
# Process a million records lazily
|
|
328
|
+
query = (
|
|
329
|
+
cq.Query(huge_dataset)
|
|
330
|
+
.filter(cq.field("status") == "active")
|
|
331
|
+
.select("id", "name")
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
# Only materializes when you iterate or collect
|
|
335
|
+
for record in query:
|
|
336
|
+
process(record)
|
|
337
|
+
|
|
338
|
+
# Or take just the first 10
|
|
339
|
+
query.limit(10).collect()
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## Examples
|
|
343
|
+
|
|
344
|
+
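The `examples/` directory in the [GitHub repository](https://github.com/ahmedmuhammad/cqla) (not shipped in the PyPI package) contains interactive [marimo](https://marimo.io/) notebooks demonstrating cqla with different data types: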
|
|
345
|
+
|
|
346
|
+
- `json_example.py` — querying plain dicts, nested field access
|
|
347
|
+
- `pydantic_example.py` — querying Pydantic models, set operations
|
|
348
|
+
- `msgspec_example.py` — querying msgspec Structs
|
|
349
|
+
- `stress_test.py` — benchmarks with large datasets
|
|
350
|
+
|
|
351
|
+
To run the examples, clone the repo and install development dependencies:
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
git clone https://github.com/ahmedmuhammad/cqla.git
|
|
355
|
+
cd cqla
|
|
356
|
+
uv sync
|
|
357
|
+
uv run marimo edit examples/
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
## License
|
|
361
|
+
|
|
362
|
+
MIT
|
cqla-0.1.0/README.md
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
# Config Query Language API (cqla)
|
|
2
|
+
|
|
3
|
+
A query language for Python collections, inspired by LINQ with Polars-style syntax.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install cqla
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## The Problem
|
|
12
|
+
|
|
13
|
+
If you've worked with Pydantic models or dataclasses, you've probably written methods inside these classes, like this:
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
@dataclass
|
|
17
|
+
class Config:
|
|
18
|
+
name: str
|
|
19
|
+
value: str
|
|
20
|
+
enabled: bool
|
|
21
|
+
priority: int
|
|
22
|
+
|
|
23
|
+
@dataclass
|
|
24
|
+
class ConfigStore:
|
|
25
|
+
configurations: list[Config]
|
|
26
|
+
|
|
27
|
+
def search_by_name(self, name: str) -> Config | None:
|
|
28
|
+
for cfg in self.configurations:
|
|
29
|
+
if cfg.name == name:
|
|
30
|
+
return cfg
|
|
31
|
+
|
|
32
|
+
def get_enabled(self) -> list[Config]:
|
|
33
|
+
return [cfg for cfg in self.configurations if cfg.enabled]
|
|
34
|
+
|
|
35
|
+
def get_high_priority(self, threshold: int) -> list[Config]:
|
|
36
|
+
return [cfg for cfg in self.configurations if cfg.priority > threshold]
|
|
37
|
+
|
|
38
|
+
def get_enabled_high_priority(self, threshold: int) -> list[Config]:
|
|
39
|
+
return [
|
|
40
|
+
cfg for cfg in self.configurations
|
|
41
|
+
if cfg.enabled and cfg.priority > threshold
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
# ...and so on, a new method for every query pattern
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
This gets tedious. Every new query requirement means another method. The logic is scattered, repetitive, and hard to compose.
|
|
48
|
+
|
|
49
|
+
With cqla, you don't need any of those methods:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
import cqla as cq
|
|
53
|
+
|
|
54
|
+
configs = [...] # list of Config objects
|
|
55
|
+
|
|
56
|
+
# Search by name
|
|
57
|
+
cq.Query(configs).filter(cq.field("name") == "database_url").first()
|
|
58
|
+
|
|
59
|
+
# Get enabled configs
|
|
60
|
+
cq.Query(configs).filter(cq.field("enabled") == True).collect()
|
|
61
|
+
|
|
62
|
+
# High-priority enabled configs
|
|
63
|
+
(cq.Query(configs)
|
|
64
|
+
.filter((cq.field("enabled") == True) & (cq.field("priority") > 5))
|
|
65
|
+
.collect())
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Works With
|
|
69
|
+
|
|
70
|
+
cqla works with any Python objects:
|
|
71
|
+
|
|
72
|
+
- Plain dicts (JSON-like data)
|
|
73
|
+
- dataclasses
|
|
74
|
+
- Pydantic models
|
|
75
|
+
- msgspec Structs
|
|
76
|
+
- Any object with attributes
|
|
77
|
+
|
|
78
|
+
## Inspiration: LINQ and Polars had a Baby
|
|
79
|
+
|
|
80
|
+
cqla is inspired by [LINQ](https://learn.microsoft.com/en-us/dotnet/csharp/linq/) (Language Integrated Query) from C#/.NET. LINQ lets you query collections using a SQL-like, composable syntax:
|
|
81
|
+
|
|
82
|
+
```csharp
|
|
83
|
+
// C# LINQ
|
|
84
|
+
var results = configs
|
|
85
|
+
.Where(c => c.Enabled && c.Priority > 5)
|
|
86
|
+
.Select(c => new { c.Name, c.Value })
|
|
87
|
+
.OrderBy(c => c.Name);
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
cqla brings this same idea to Python, but with syntax borrowed from Polars:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# cqla (Python)
|
|
94
|
+
results = (
|
|
95
|
+
cq.Query(configs)
|
|
96
|
+
.filter((cq.field("enabled") == True) & (cq.field("priority") > 5))
|
|
97
|
+
.select("name", "value")
|
|
98
|
+
.collect()
|
|
99
|
+
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Alternatives?
|
|
103
|
+
|
|
104
|
+
Libraries like [pydash](https://pydash.readthedocs.io/) and [toolz](https://toolz.readthedocs.io/) are excellent for functional programming patterns:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# pydash
|
|
108
|
+
import pydash as _
|
|
109
|
+
|
|
110
|
+
_.filter_(configs, lambda c: c.enabled and c.priority > 5)
|
|
111
|
+
_.map_(configs, lambda c: c.name.upper())
|
|
112
|
+
_.group_by(configs, "category")
|
|
113
|
+
|
|
114
|
+
# toolz
|
|
115
|
+
from toolz import filter, map, groupby
|
|
116
|
+
from toolz.curried import pipe
|
|
117
|
+
|
|
118
|
+
list(filter(lambda c: c.enabled, configs))
|
|
119
|
+
list(map(lambda c: c.name.upper(), configs))
|
|
120
|
+
groupby(lambda c: c.category, configs)
|
|
121
|
+
|
|
122
|
+
# composing operations in toolz
|
|
123
|
+
pipe(configs,
|
|
124
|
+
lambda x: filter(lambda c: c.enabled, x),
|
|
125
|
+
lambda x: map(lambda c: {"name": c.name.upper(), "priority": c.priority}, x),
|
|
126
|
+
list)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
These work, but if you think in SQL, they feel inside-out. The operations are functions you wrap around the data (in toolz the data even comes last), and composing multiple operations requires nesting or piping.
|
|
130
|
+
|
|
131
|
+
cqla reads like SQL, and when you need custom transformations, `.apply()` lets you drop into a lambda:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# cqla - reads top to bottom, left to right
|
|
135
|
+
(
|
|
136
|
+
cq.Query(configs)
|
|
137
|
+
.filter(cq.field("enabled") == True) # WHERE enabled = true
|
|
138
|
+
.filter(cq.field("priority") > 5) # AND priority > 5
|
|
139
|
+
.group_by("category") # GROUP BY category
|
|
140
|
+
.having(cq.field("priority").count() > 2) # HAVING COUNT(priority) > 2
|
|
141
|
+
.agg( # SELECT ...
|
|
142
|
+
count=cq.field("name").count(),
|
|
143
|
+
avg_priority=cq.field("priority").mean(),
|
|
144
|
+
)
|
|
145
|
+
.collect()
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
# apply() for custom transformations
|
|
149
|
+
(
|
|
150
|
+
cq.Query(configs)
|
|
151
|
+
.filter(cq.field("enabled") == True)
|
|
152
|
+
.select(
|
|
153
|
+
"name",
|
|
154
|
+
name_upper=cq.field("name").apply(str.upper),
|
|
155
|
+
slug=cq.field("name").apply(lambda s: s.lower().replace(" ", "-")),
|
|
156
|
+
)
|
|
157
|
+
.collect()
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Why Not Polars or Pandas?
|
|
162
|
+
|
|
163
|
+
Polars and Pandas are built for tabular data — rows and columns, where every row has the same schema. They're optimized for numerical computation on large datasets.
|
|
164
|
+
|
|
165
|
+
But configuration data, API responses, and domain objects are often semi-structured or nested:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
configs = [
|
|
169
|
+
{
|
|
170
|
+
"name": "app",
|
|
171
|
+
"settings": {
|
|
172
|
+
"database": {"host": "localhost", "port": 5432},
|
|
173
|
+
"features": ["auth", "logging", "metrics"],
|
|
174
|
+
},
|
|
175
|
+
"metadata": {"version": 1, "tags": ["production"]},
|
|
176
|
+
},
|
|
177
|
+
{
|
|
178
|
+
"name": "worker",
|
|
179
|
+
"settings": {
|
|
180
|
+
"queue": "redis://localhost",
|
|
181
|
+
# no "database" key here
|
|
182
|
+
},
|
|
183
|
+
"metadata": {"version": 2}, # no "tags" key
|
|
184
|
+
},
|
|
185
|
+
]
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Try loading this into Pandas:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
import pandas as pd
|
|
192
|
+
|
|
193
|
+
df = pd.DataFrame(configs)
|
|
194
|
+
print(df)
|
|
195
|
+
# name settings metadata
|
|
196
|
+
# 0 app {'database': {'host': 'localhost', 'port': 54... {'version': 1, 'tags': ['production']}
|
|
197
|
+
# 1 worker {'queue': 'redis://localhost'} {'version': 2}
|
|
198
|
+
|
|
199
|
+
# Want to filter by database host? Good luck.
|
|
200
|
+
df[df["settings"].apply(lambda s: s.get("database", {}).get("host")) == "localhost"]
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
The nested dicts stay as opaque objects. You're back to writing lambdas and `.apply()`.
|
|
204
|
+
|
|
205
|
+
Polars struggles with this data too:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
import polars as pl
|
|
209
|
+
|
|
210
|
+
df = pl.DataFrame(configs)
|
|
211
|
+
# polars.exceptions.SchemaError:
|
|
212
|
+
# could not append value: {"database": {"host": "localhost" ...
|
|
213
|
+
# struct fields must have a consistent schema
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Polars won't even load it because the schemas don't match.
|
|
217
|
+
|
|
218
|
+
cqla handles this naturally:
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
import cqla as cq
|
|
222
|
+
|
|
223
|
+
# Filter by nested field
|
|
224
|
+
(
|
|
225
|
+
cq.Query(configs)
|
|
226
|
+
.filter(cq.field("settings.database.host") == "localhost")
|
|
227
|
+
.collect()
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
# Access nested fields in select
|
|
231
|
+
(
|
|
232
|
+
cq.Query(configs)
|
|
233
|
+
.select(
|
|
234
|
+
"name",
|
|
235
|
+
db_host=cq.field("settings.database.host"),
|
|
236
|
+
version=cq.field("metadata.version"),
|
|
237
|
+
)
|
|
238
|
+
.collect()
|
|
239
|
+
)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## Features
|
|
243
|
+
|
|
244
|
+
cqla supports the operations you'd expect from a query language:
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
import cqla as cq
|
|
248
|
+
|
|
249
|
+
data = [...] # list of dicts, dataclasses, Pydantic models, or any objects
|
|
250
|
+
|
|
251
|
+
# Filtering
|
|
252
|
+
cq.Query(data).filter(cq.field("age") > 30).collect()
|
|
253
|
+
cq.Query(data).filter((cq.field("age") > 30) & (cq.field("active") == True)).collect()
|
|
254
|
+
|
|
255
|
+
# Selecting fields
|
|
256
|
+
cq.Query(data).select("name", "email").collect()
|
|
257
|
+
cq.Query(data).select("name", uppercased=cq.field("name").str.to_uppercase()).collect()
|
|
258
|
+
|
|
259
|
+
# Adding computed columns
|
|
260
|
+
cq.Query(data).with_columns(
|
|
261
|
+
year=cq.field("created_at").dt.year(),
|
|
262
|
+
name_lower=cq.field("name").str.to_lowercase(),
|
|
263
|
+
).collect()
|
|
264
|
+
|
|
265
|
+
# Conditional expressions
|
|
266
|
+
cq.Query(data).select(
|
|
267
|
+
"name",
|
|
268
|
+
tier=cq.when(cq.field("score") >= 90).then("gold")
|
|
269
|
+
.when(cq.field("score") >= 70).then("silver")
|
|
270
|
+
.otherwise("bronze"),
|
|
271
|
+
).collect()
|
|
272
|
+
|
|
273
|
+
# Grouping and aggregation
|
|
274
|
+
cq.Query(data).group_by("department").agg(
|
|
275
|
+
count=cq.field("id").count(),
|
|
276
|
+
avg_salary=cq.field("salary").mean(),
|
|
277
|
+
).collect()
|
|
278
|
+
|
|
279
|
+
# Filtering groups (HAVING)
|
|
280
|
+
cq.Query(data).group_by("department").having(
|
|
281
|
+
cq.field("id").count() >= 5
|
|
282
|
+
).agg(
|
|
283
|
+
count=cq.field("id").count(),
|
|
284
|
+
).collect()
|
|
285
|
+
|
|
286
|
+
# Window functions
|
|
287
|
+
cq.Query(data).with_columns(
|
|
288
|
+
dept_avg=cq.field("salary").mean().over("department"),
|
|
289
|
+
).collect()
|
|
290
|
+
|
|
291
|
+
# Explode: expand list field into multiple rows
|
|
292
|
+
cq.Query(data).explode("tags").collect()
|
|
293
|
+
# [{"name": "alice", "tags": ["a", "b"]}] -> [{"name": "alice", "tags": "a"}, {"name": "alice", "tags": "b"}]
|
|
294
|
+
|
|
295
|
+
# Accessors for strings, lists, sets, datetimes
|
|
296
|
+
cq.field("name").str.contains("smith", literal=True)
|
|
297
|
+
cq.field("tags").list.len()
|
|
298
|
+
cq.field("categories").set.contains("electronics")
|
|
299
|
+
cq.field("created_at").dt.year()
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
## Scalability
|
|
303
|
+
|
|
304
|
+
cqla is built on generators. Operations like `filter`, `select`, and `with_columns` are lazy: they don't materialize the dataset until you iterate over the query or call `.collect()`. This means you can process large datasets without loading everything into memory:
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
# Process a million records lazily
|
|
308
|
+
query = (
|
|
309
|
+
cq.Query(huge_dataset)
|
|
310
|
+
.filter(cq.field("status") == "active")
|
|
311
|
+
.select("id", "name")
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
# Only materializes when you iterate or collect
|
|
315
|
+
for record in query:
|
|
316
|
+
process(record)
|
|
317
|
+
|
|
318
|
+
# Or take just the first 10
|
|
319
|
+
query.limit(10).collect()
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
## Examples
|
|
323
|
+
|
|
324
|
+
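The `examples/` directory in the [GitHub repository](https://github.com/ahmedmuhammad/cqla) (not shipped in the PyPI package) contains interactive [marimo](https://marimo.io/) notebooks demonstrating cqla with different data types: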
|
|
325
|
+
|
|
326
|
+
- `json_example.py` — querying plain dicts, nested field access
|
|
327
|
+
- `pydantic_example.py` — querying Pydantic models, set operations
|
|
328
|
+
- `msgspec_example.py` — querying msgspec Structs
|
|
329
|
+
- `stress_test.py` — benchmarks with large datasets
|
|
330
|
+
|
|
331
|
+
To run the examples, clone the repo and install development dependencies:
|
|
332
|
+
|
|
333
|
+
```bash
|
|
334
|
+
git clone https://github.com/ahmedmuhammad/cqla.git
|
|
335
|
+
cd cqla
|
|
336
|
+
uv sync
|
|
337
|
+
uv run marimo edit examples/
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
## License
|
|
341
|
+
|
|
342
|
+
MIT
|