sarj-python-lint 0.4.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/PKG-INFO +6 -1
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/README.md +5 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/pyproject.toml +5 -2
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/__main__.py +11 -4
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rule_base.py +13 -6
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/__init__.py +6 -0
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/_logging.py +35 -0
- sarj_python_lint-0.4.1/src/sarj_python_lint/rules/__init__.py → sarj_python_lint-0.6.0/src/sarj_python_lint/rules/_registry.py +32 -1
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/inefficient_string_concat_in_loop.py +48 -29
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/no_aggregation_in_store_query.py +134 -0
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/no_comment_cruft.py +252 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/no_fat_try_blocks.py +10 -4
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/no_fstring_in_log.py +106 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/no_isinstance_union_chain.py +23 -10
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/no_query_with_many_joins.py +110 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/no_secret_in_log.py +15 -22
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/no_select_star.py +114 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/no_sentinel_return_on_except.py +14 -19
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/no_sequential_await.py +94 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/no_unreachable_after_terminal.py +14 -10
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/prefer_class_row.py +90 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/prefer_constant_time_secret_compare.py +11 -5
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/prefer_discriminated_union.py +41 -23
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/prefer_str_enum.py +13 -7
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/prefer_struct_over_namedtuple.py +100 -0
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/prefer_timedelta_for_durations.py +196 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/rules/pydantic_at_boundaries.py +13 -7
- sarj_python_lint-0.6.0/src/sarj_python_lint/rules/store_insert_requires_on_conflict.py +106 -0
- sarj_python_lint-0.4.1/src/sarj_python_lint/rules/no_sequential_await.py +0 -71
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/.gitignore +0 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/__init__.py +0 -0
- {sarj_python_lint-0.4.1 → sarj_python_lint-0.6.0}/src/sarj_python_lint/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sarj-python-lint
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Custom Python lint rules — AST-based, pre-commit-friendly, hypermodern defaults
|
|
5
5
|
Project-URL: Homepage, https://github.com/sarj-ai/standards/tree/main/packages/python
|
|
6
6
|
Project-URL: Repository, https://github.com/sarj-ai/standards
|
|
@@ -36,6 +36,11 @@ uv tool install sarj-python-lint
|
|
|
36
36
|
- id: sarj-prefer-str-enum
|
|
37
37
|
- id: sarj-no-fat-try-blocks
|
|
38
38
|
- id: sarj-pydantic-at-boundaries
|
|
39
|
+
- id: sarj-prefer-class-row
|
|
40
|
+
- id: sarj-prefer-timedelta-for-durations
|
|
41
|
+
- id: sarj-prefer-struct-over-namedtuple
|
|
42
|
+
- id: sarj-no-comment-cruft
|
|
43
|
+
- id: sarj-no-fstring-in-log
|
|
39
44
|
```
|
|
40
45
|
|
|
41
46
|
## CLI
|
|
@@ -18,6 +18,11 @@ uv tool install sarj-python-lint
|
|
|
18
18
|
- id: sarj-prefer-str-enum
|
|
19
19
|
- id: sarj-no-fat-try-blocks
|
|
20
20
|
- id: sarj-pydantic-at-boundaries
|
|
21
|
+
- id: sarj-prefer-class-row
|
|
22
|
+
- id: sarj-prefer-timedelta-for-durations
|
|
23
|
+
- id: sarj-prefer-struct-over-namedtuple
|
|
24
|
+
- id: sarj-no-comment-cruft
|
|
25
|
+
- id: sarj-no-fstring-in-log
|
|
21
26
|
```
|
|
22
27
|
|
|
23
28
|
## CLI
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "sarj-python-lint"
|
|
3
|
-
version = "0.4.1"
|
|
3
|
+
version = "0.6.0"
|
|
4
4
|
description = "Custom Python lint rules — AST-based, pre-commit-friendly, hypermodern defaults"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "sarj-ai" }]
|
|
@@ -27,7 +27,6 @@ Issues = "https://github.com/sarj-ai/standards/issues"
|
|
|
27
27
|
[dependency-groups]
|
|
28
28
|
dev = [
|
|
29
29
|
"pytest>=9.0",
|
|
30
|
-
"pytest-benchmark>=5.2",
|
|
31
30
|
"ruff>=0.15",
|
|
32
31
|
"basedpyright>=1.39",
|
|
33
32
|
]
|
|
@@ -53,3 +52,7 @@ exclude = [
|
|
|
53
52
|
|
|
54
53
|
[tool.pytest.ini_options]
|
|
55
54
|
testpaths = ["tests"]
|
|
55
|
+
|
|
56
|
+
# Dogfooding: linted/formatted by this repo's own published config (root-synced).
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
extend = "../../.ruff-strict.toml"
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
4
|
import argparse
|
|
5
|
-
import sys
|
|
6
5
|
from pathlib import Path
|
|
6
|
+
import sys
|
|
7
7
|
|
|
8
8
|
from sarj_python_lint import __version__
|
|
9
9
|
from sarj_python_lint.rule_base import Diagnostic, is_suppressed
|
|
@@ -16,6 +16,10 @@ SKIP_DIR_NAMES = {
|
|
|
16
16
|
".turbo", ".yarn", ".pnpm-store",
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
+
# Skip files larger than this — they are almost always generated/vendored, not
|
|
20
|
+
# hand-written source worth linting.
|
|
21
|
+
_MAX_FILE_BYTES = 500_000
|
|
22
|
+
|
|
19
23
|
|
|
20
24
|
def _expand_paths(paths: list[Path]) -> list[Path]:
|
|
21
25
|
out: list[Path] = []
|
|
@@ -31,7 +35,7 @@ def _expand_paths(paths: list[Path]) -> list[Path]:
|
|
|
31
35
|
if any(part in SKIP_DIR_NAMES for part in child.parts):
|
|
32
36
|
continue
|
|
33
37
|
try:
|
|
34
|
-
if child.stat().st_size >
|
|
38
|
+
if child.stat().st_size > _MAX_FILE_BYTES:
|
|
35
39
|
continue
|
|
36
40
|
except OSError:
|
|
37
41
|
continue
|
|
@@ -82,14 +86,17 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
82
86
|
sub.add_parser("list-rules", help="List available rule IDs.")
|
|
83
87
|
|
|
84
88
|
args = parser.parse_args(argv)
|
|
89
|
+
cmd: str | None = args.cmd
|
|
85
90
|
|
|
86
|
-
if
|
|
91
|
+
if cmd == "list-rules":
|
|
87
92
|
for rid, cls in sorted(REGISTRY.items()):
|
|
88
93
|
inst = cls()
|
|
89
94
|
sys.stdout.write(f"{inst.code:8} {rid:40} {inst.description}\n")
|
|
90
95
|
return 0
|
|
91
96
|
|
|
92
|
-
|
|
97
|
+
rule_ids: list[str] = args.rule
|
|
98
|
+
files: list[Path] = args.files
|
|
99
|
+
diags = _check(rule_ids, files)
|
|
93
100
|
for d in diags:
|
|
94
101
|
sys.stdout.write(d.format() + "\n")
|
|
95
102
|
return 1 if diags else 0
|
|
@@ -2,24 +2,31 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import re
|
|
6
5
|
from abc import ABC, abstractmethod
|
|
7
6
|
from dataclasses import dataclass
|
|
8
|
-
|
|
7
|
+
import re
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
9
15
|
|
|
10
16
|
# Suppression syntax. Two forms supported:
|
|
11
17
|
# # sarj-noqa: SARJ001 — reason
|
|
12
18
|
# # sarj-noqa: SARJ001, SARJ002 — reason
|
|
13
|
-
# We deliberately do NOT
|
|
14
|
-
# unrecognized
|
|
15
|
-
# silently breaks suppressions across runs.
|
|
19
|
+
# We deliberately do NOT reuse ruff's own suppression comment because ruff
|
|
20
|
+
# aggressively cleans unrecognized codes (RUF100/RUF102) even with `external`
|
|
21
|
+
# set, which silently breaks suppressions across runs. A distinct prefix
|
|
22
|
+
# (sarj-noqa) shares no syntax with ruff, so the two never collide.
|
|
16
23
|
_SARJ_NOQA_RE = re.compile(
|
|
17
24
|
r"#\s*sarj-noqa(?::\s*([A-Za-z0-9_, ]+))?",
|
|
18
25
|
re.IGNORECASE,
|
|
19
26
|
)
|
|
20
27
|
|
|
21
28
|
|
|
22
|
-
def is_suppressed(source_lines:
|
|
29
|
+
def is_suppressed(source_lines: Sequence[str], line: int, code: str) -> bool:
|
|
23
30
|
"""Return True if the diagnostic's line carries a `# sarj-noqa[: CODE]` comment.
|
|
24
31
|
|
|
25
32
|
`line` is 1-based to match Diagnostic.line.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Shared logging-receiver detection for SARJ012/SARJ017.
|
|
2
|
+
|
|
3
|
+
A single resolver for "does this receiver expression evaluate to a logger?",
|
|
4
|
+
used by both the secret-in-log and f-string-in-log rules so they recognise the
|
|
5
|
+
same factory/builder forms.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import ast
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
_LOGGER_NAMES = frozenset({"logger", "log", "logging", "loguru", "_logger", "_log"})
|
|
14
|
+
|
|
15
|
+
_LOGGER_FACTORIES = frozenset({"getlogger", "getchild"})
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def is_logger_expr(expr: ast.expr) -> bool:
|
|
19
|
+
"""True if `expr` evaluates to a logger.
|
|
20
|
+
|
|
21
|
+
Resolves the whole receiver chain so adapter/builder/factory calls are
|
|
22
|
+
caught: `logger.bind(...).info(...)`, `logger.opt(lazy=True).debug(...)`,
|
|
23
|
+
`logging.getLogger(__name__).info(...)`, `self.logger.error(...)`.
|
|
24
|
+
"""
|
|
25
|
+
if isinstance(expr, ast.Name):
|
|
26
|
+
return expr.id.lower() in _LOGGER_NAMES
|
|
27
|
+
if isinstance(expr, ast.Attribute):
|
|
28
|
+
if expr.attr.lower() in _LOGGER_NAMES or expr.attr.lower() in _LOGGER_FACTORIES:
|
|
29
|
+
return True
|
|
30
|
+
return is_logger_expr(expr.value)
|
|
31
|
+
if isinstance(expr, ast.Call):
|
|
32
|
+
if isinstance(expr.func, ast.Attribute) and expr.func.attr.lower() in _LOGGER_FACTORIES:
|
|
33
|
+
return True
|
|
34
|
+
return is_logger_expr(expr.func)
|
|
35
|
+
return False
|
|
@@ -1,29 +1,52 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from sarj_python_lint.rule_base import Rule
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
4
5
|
from sarj_python_lint.rules.inefficient_string_concat_in_loop import (
|
|
5
6
|
InefficientStringConcatInLoop,
|
|
6
7
|
)
|
|
8
|
+
from sarj_python_lint.rules.no_aggregation_in_store_query import (
|
|
9
|
+
NoAggregationInStoreQuery,
|
|
10
|
+
)
|
|
11
|
+
from sarj_python_lint.rules.no_comment_cruft import NoCommentCruft
|
|
7
12
|
from sarj_python_lint.rules.no_fat_try_blocks import NoFatTryBlocks
|
|
13
|
+
from sarj_python_lint.rules.no_fstring_in_log import NoFstringInLog
|
|
8
14
|
from sarj_python_lint.rules.no_isinstance_union_chain import NoIsinstanceUnionChain
|
|
15
|
+
from sarj_python_lint.rules.no_query_with_many_joins import NoQueryWithManyJoins
|
|
9
16
|
from sarj_python_lint.rules.no_secret_in_log import NoSecretInLog
|
|
17
|
+
from sarj_python_lint.rules.no_select_star import NoSelectStar
|
|
10
18
|
from sarj_python_lint.rules.no_sentinel_return_on_except import NoSentinelReturnOnExcept
|
|
11
19
|
from sarj_python_lint.rules.no_sequential_await import NoSequentialAwait
|
|
12
20
|
from sarj_python_lint.rules.no_unreachable_after_terminal import (
|
|
13
21
|
NoUnreachableAfterTerminal,
|
|
14
22
|
)
|
|
23
|
+
from sarj_python_lint.rules.prefer_class_row import PreferClassRow
|
|
15
24
|
from sarj_python_lint.rules.prefer_constant_time_secret_compare import (
|
|
16
25
|
PreferConstantTimeSecretCompare,
|
|
17
26
|
)
|
|
18
27
|
from sarj_python_lint.rules.prefer_discriminated_union import PreferDiscriminatedUnion
|
|
19
28
|
from sarj_python_lint.rules.prefer_str_enum import PreferStrEnum
|
|
29
|
+
from sarj_python_lint.rules.prefer_struct_over_namedtuple import (
|
|
30
|
+
PreferStructOverNamedtuple,
|
|
31
|
+
)
|
|
32
|
+
from sarj_python_lint.rules.prefer_timedelta_for_durations import (
|
|
33
|
+
PreferTimedeltaForDurations,
|
|
34
|
+
)
|
|
20
35
|
from sarj_python_lint.rules.pydantic_at_boundaries import PydanticAtBoundaries
|
|
36
|
+
from sarj_python_lint.rules.store_insert_requires_on_conflict import (
|
|
37
|
+
StoreInsertRequiresOnConflict,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
if TYPE_CHECKING:
|
|
42
|
+
from sarj_python_lint.rule_base import Rule
|
|
21
43
|
|
|
22
44
|
|
|
23
45
|
REGISTRY: dict[str, type[Rule]] = {
|
|
24
46
|
NoSequentialAwait.id: NoSequentialAwait,
|
|
25
47
|
InefficientStringConcatInLoop.id: InefficientStringConcatInLoop,
|
|
26
48
|
PreferDiscriminatedUnion.id: PreferDiscriminatedUnion,
|
|
49
|
+
PreferClassRow.id: PreferClassRow,
|
|
27
50
|
PreferStrEnum.id: PreferStrEnum,
|
|
28
51
|
NoFatTryBlocks.id: NoFatTryBlocks,
|
|
29
52
|
NoIsinstanceUnionChain.id: NoIsinstanceUnionChain,
|
|
@@ -32,6 +55,14 @@ REGISTRY: dict[str, type[Rule]] = {
|
|
|
32
55
|
NoUnreachableAfterTerminal.id: NoUnreachableAfterTerminal,
|
|
33
56
|
PreferConstantTimeSecretCompare.id: PreferConstantTimeSecretCompare,
|
|
34
57
|
NoSecretInLog.id: NoSecretInLog,
|
|
58
|
+
PreferTimedeltaForDurations.id: PreferTimedeltaForDurations,
|
|
59
|
+
PreferStructOverNamedtuple.id: PreferStructOverNamedtuple,
|
|
60
|
+
NoCommentCruft.id: NoCommentCruft,
|
|
61
|
+
NoFstringInLog.id: NoFstringInLog,
|
|
62
|
+
StoreInsertRequiresOnConflict.id: StoreInsertRequiresOnConflict,
|
|
63
|
+
NoQueryWithManyJoins.id: NoQueryWithManyJoins,
|
|
64
|
+
NoAggregationInStoreQuery.id: NoAggregationInStoreQuery,
|
|
65
|
+
NoSelectStar.id: NoSelectStar,
|
|
35
66
|
}
|
|
36
67
|
|
|
37
68
|
__all__ = ["REGISTRY"]
|
|
@@ -12,48 +12,67 @@ References:
|
|
|
12
12
|
from __future__ import annotations
|
|
13
13
|
|
|
14
14
|
import ast
|
|
15
|
-
from pathlib import Path
|
|
15
|
+
from typing import TYPE_CHECKING, override
|
|
16
16
|
|
|
17
17
|
from sarj_python_lint.rule_base import Diagnostic, Rule
|
|
18
18
|
|
|
19
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
|
|
20
24
|
class InefficientStringConcatInLoop(Rule):
|
|
21
25
|
"""O(n²) string concatenation in a loop."""
|
|
22
26
|
|
|
23
|
-
id = "inefficient-string-concat-in-loop"
|
|
24
|
-
code = "SARJ002"
|
|
25
|
-
description = "`s += '...'` in a loop is O(n²); append to a list and join."
|
|
27
|
+
id: str = "inefficient-string-concat-in-loop"
|
|
28
|
+
code: str = "SARJ002"
|
|
29
|
+
description: str = "`s += '...'` in a loop is O(n²); append to a list and join."
|
|
26
30
|
|
|
31
|
+
@override
|
|
27
32
|
def check(self, path: Path, source: str) -> list[Diagnostic]:
|
|
28
33
|
try:
|
|
29
34
|
tree = ast.parse(source, filename=str(path))
|
|
30
35
|
except SyntaxError:
|
|
31
36
|
return []
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
37
|
+
visitor = _ConcatVisitor()
|
|
38
|
+
visitor.visit(tree)
|
|
39
|
+
return [
|
|
40
|
+
Diagnostic(
|
|
41
|
+
path=path,
|
|
42
|
+
line=node.lineno,
|
|
43
|
+
col=node.col_offset + 1,
|
|
44
|
+
code=self.code,
|
|
45
|
+
message=(
|
|
46
|
+
"`+=` string concat in a loop is O(n²). "
|
|
47
|
+
"Append to a list and `''.join(...)`."
|
|
48
|
+
),
|
|
49
|
+
)
|
|
50
|
+
for node in visitor.hits
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class _ConcatVisitor(ast.NodeVisitor):
|
|
55
|
+
"""Single O(n) pass flagging each in-loop string `+=` exactly once."""
|
|
56
|
+
|
|
57
|
+
def __init__(self) -> None:
|
|
58
|
+
self._loop_depth: int = 0
|
|
59
|
+
self.hits: list[ast.AugAssign] = []
|
|
60
|
+
|
|
61
|
+
@override
|
|
62
|
+
def generic_visit(self, node: ast.AST) -> None:
|
|
63
|
+
if isinstance(node, (ast.For, ast.While)):
|
|
64
|
+
self._loop_depth += 1
|
|
65
|
+
super().generic_visit(node)
|
|
66
|
+
self._loop_depth -= 1
|
|
67
|
+
return
|
|
68
|
+
if (
|
|
69
|
+
self._loop_depth
|
|
70
|
+
and isinstance(node, ast.AugAssign)
|
|
71
|
+
and isinstance(node.op, ast.Add)
|
|
72
|
+
and _looks_like_string(node.value)
|
|
73
|
+
):
|
|
74
|
+
self.hits.append(node)
|
|
75
|
+
super().generic_visit(node)
|
|
57
76
|
|
|
58
77
|
|
|
59
78
|
def _looks_like_string(node: ast.AST) -> bool:
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""SARJ020: no DISTINCT / GROUP BY / COUNT in a store query — aggregate elsewhere.
|
|
2
|
+
|
|
3
|
+
Heavy aggregation (`COUNT`, `GROUP BY`, `DISTINCT`) does not belong in the
|
|
4
|
+
transactional Postgres store layer: it scans, sorts, and hashes large row sets
|
|
5
|
+
on the primary, competing with the latency-critical OLTP path. The house
|
|
6
|
+
direction is to push aggregate/analytical reads to the columnar mirror
|
|
7
|
+
(ClickHouse / BigQuery), where they are cheap, and keep Postgres queries to
|
|
8
|
+
point lookups and small bounded reads.
|
|
9
|
+
|
|
10
|
+
This rule walks SQL string literals embedded in `.py` (`*_store.py`) and flags
|
|
11
|
+
any query (a string containing `FROM`) that uses `COUNT(`, `GROUP BY`, or
|
|
12
|
+
`DISTINCT`. `--` and `/* */` comments are stripped first.
|
|
13
|
+
|
|
14
|
+
# flagged
|
|
15
|
+
"SELECT status, COUNT(*) FROM call GROUP BY status"
|
|
16
|
+
"SELECT DISTINCT org_id FROM call"
|
|
17
|
+
|
|
18
|
+
# preferred
|
|
19
|
+
point/bounded reads in Postgres; aggregate in ClickHouse/BigQuery.
|
|
20
|
+
|
|
21
|
+
If an aggregate genuinely must run on Postgres (e.g. a tiny bounded admin
|
|
22
|
+
count), suppress with `# sarj-noqa: SARJ020 — <reason>`.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import ast
|
|
28
|
+
import re
|
|
29
|
+
from typing import TYPE_CHECKING, override
|
|
30
|
+
|
|
31
|
+
from sarj_python_lint.rule_base import Diagnostic, Rule
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
_LINE_COMMENT = re.compile(r"--.*?$", re.MULTILINE)
|
|
39
|
+
_BLOCK_COMMENT = re.compile(r"/\*.*?\*/", re.DOTALL)
|
|
40
|
+
# A real SQL query shape — not just the word "from", so prose/LLM-prompt strings
|
|
41
|
+
# (e.g. "distinct from unexpected exceptions") are not mistaken for queries.
|
|
42
|
+
_QUERY_SHAPE = re.compile(
|
|
43
|
+
r"\bSELECT\b[\s\S]*?\bFROM\b|\bUPDATE\b[\s\S]*?\bSET\b|\bDELETE\b\s+FROM\b",
|
|
44
|
+
re.IGNORECASE,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# ClickHouse IS the place for aggregation. A file that talks to ClickHouse (the
|
|
48
|
+
# columnar mirror) is exempt — only Postgres store queries are in scope.
|
|
49
|
+
_CLICKHOUSE_FILE = re.compile(
|
|
50
|
+
r"\bclickhouse_connect\b|\bclickhouse_driver\b|^\s*import\s+clickhouse\b",
|
|
51
|
+
re.MULTILINE,
|
|
52
|
+
)
|
|
53
|
+
# Belt-and-braces: a single query using ClickHouse-only functions is ClickHouse.
|
|
54
|
+
_CLICKHOUSE_SQL = re.compile(
|
|
55
|
+
r"\barg(?:Max|Min)\b|\b_peerdb|\bJSONExtract|\buniqExact\b|\bgroupArray\b"
|
|
56
|
+
r"|\barrayJoin\b|\bquantile\w*\(",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
_AGGREGATIONS: tuple[tuple[str, re.Pattern[str]], ...] = (
|
|
60
|
+
("COUNT(", re.compile(r"\bCOUNT\s*\(", re.IGNORECASE)),
|
|
61
|
+
("GROUP BY", re.compile(r"\bGROUP\s+BY\b", re.IGNORECASE)),
|
|
62
|
+
("DISTINCT", re.compile(r"\bDISTINCT\b", re.IGNORECASE)),
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _string_value(node: ast.expr) -> str | None:
|
|
67
|
+
"""Reconstruct a (possibly `+`-concatenated) string literal, else None."""
|
|
68
|
+
if isinstance(node, ast.Constant) and isinstance(node.value, str):
|
|
69
|
+
return node.value
|
|
70
|
+
if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
|
|
71
|
+
left = _string_value(node.left)
|
|
72
|
+
right = _string_value(node.right)
|
|
73
|
+
if left is not None and right is not None:
|
|
74
|
+
return left + right
|
|
75
|
+
return None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _strip_sql_comments(text: str) -> str:
|
|
79
|
+
return _BLOCK_COMMENT.sub(" ", _LINE_COMMENT.sub("", text))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class NoAggregationInStoreQuery(Rule):
|
|
83
|
+
"""DISTINCT / GROUP BY / COUNT in a store query — aggregate in ClickHouse."""
|
|
84
|
+
|
|
85
|
+
id = "no-aggregation-in-store-query"
|
|
86
|
+
code = "SARJ020"
|
|
87
|
+
description = (
|
|
88
|
+
"DISTINCT / GROUP BY / COUNT in a Postgres store query — push heavy "
|
|
89
|
+
"aggregation to the columnar mirror (ClickHouse / BigQuery)."
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
@override
|
|
93
|
+
def check(self, path: Path, source: str) -> list[Diagnostic]:
|
|
94
|
+
if _CLICKHOUSE_FILE.search(source):
|
|
95
|
+
return []
|
|
96
|
+
try:
|
|
97
|
+
tree = ast.parse(source, filename=str(path))
|
|
98
|
+
except SyntaxError:
|
|
99
|
+
return []
|
|
100
|
+
|
|
101
|
+
diags: list[Diagnostic] = []
|
|
102
|
+
consumed: set[int] = set()
|
|
103
|
+
for node in ast.walk(tree):
|
|
104
|
+
if not isinstance(node, ast.Constant | ast.BinOp):
|
|
105
|
+
continue
|
|
106
|
+
if id(node) in consumed:
|
|
107
|
+
continue
|
|
108
|
+
text = _string_value(node)
|
|
109
|
+
if text is None:
|
|
110
|
+
continue
|
|
111
|
+
consumed.update(id(sub) for sub in ast.walk(node))
|
|
112
|
+
|
|
113
|
+
sql = _strip_sql_comments(text)
|
|
114
|
+
if _QUERY_SHAPE.search(sql) is None or _CLICKHOUSE_SQL.search(sql):
|
|
115
|
+
continue
|
|
116
|
+
found = [label for label, pat in _AGGREGATIONS if pat.search(sql)]
|
|
117
|
+
if not found:
|
|
118
|
+
continue
|
|
119
|
+
|
|
120
|
+
diags.append(
|
|
121
|
+
Diagnostic(
|
|
122
|
+
path=path,
|
|
123
|
+
line=node.lineno,
|
|
124
|
+
col=node.col_offset + 1,
|
|
125
|
+
code=self.code,
|
|
126
|
+
message=(
|
|
127
|
+
f"Store query uses {', '.join(found)} — push heavy "
|
|
128
|
+
"aggregation to ClickHouse / BigQuery, keep Postgres to "
|
|
129
|
+
"point/bounded reads. Suppress with `# sarj-noqa: SARJ020`."
|
|
130
|
+
),
|
|
131
|
+
)
|
|
132
|
+
)
|
|
133
|
+
diags.sort(key=lambda d: (d.line, d.col))
|
|
134
|
+
return diags
|