codepathfinder 1.2.0__py3-none-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codepathfinder/ir.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ JSON Intermediate Representation (IR) for pathfinder DSL.
3
+
4
+ The Python DSL serializes to JSON IR, which the Go executor consumes.
5
+ This enables language-agnostic pattern definitions (future: JS, Rust DSLs).
6
+ """
7
+
8
+ from enum import Enum
9
+ from typing import Any, Dict, Protocol
10
+
11
+
12
class IRType(Enum):
    """Discriminator values for the ``"type"`` field of an IR node.

    Each member's value is the exact string used in the JSON IR.
    """

    # Leaf matchers
    CALL_MATCHER = "call_matcher"
    VARIABLE_MATCHER = "variable_matcher"

    # Dataflow node
    DATAFLOW = "dataflow"

    # Boolean combinators
    LOGIC_AND = "logic_and"
    LOGIC_OR = "logic_or"
    LOGIC_NOT = "logic_not"
21
+
22
+
23
class MatcherIR(Protocol):
    """Structural (duck-typed) interface shared by all matcher types.

    Any object that exposes a ``to_ir()`` method satisfies this protocol;
    no inheritance from this class is required.
    """

    def to_ir(self) -> Dict[str, Any]:
        """Return this matcher as a JSON IR dictionary."""
        ...
29
+
30
+
31
def serialize_ir(matcher: MatcherIR) -> Dict[str, Any]:
    """
    Serialize any matcher to JSON IR.

    Args:
        matcher: Any object implementing MatcherIR protocol

    Returns:
        JSON-serializable dictionary (whatever ``matcher.to_ir()`` returns)

    Raises:
        AttributeError: If matcher doesn't implement to_ir(), i.e. the
            attribute is missing or is not callable
    """
    # Look the method up once. A missing attribute and a non-callable one
    # (e.g. ``to_ir = None``) are both "does not implement to_ir()", so both
    # get the documented AttributeError instead of a stray TypeError on call.
    to_ir = getattr(matcher, "to_ir", None)
    if not callable(to_ir):
        raise AttributeError(f"{type(matcher).__name__} must implement to_ir() method")

    return to_ir()
48
+
49
+
50
def validate_ir(ir: Dict[str, Any]) -> bool:
    """
    Validate JSON IR structure.

    Args:
        ir: JSON IR dictionary

    Returns:
        True if valid, False otherwise. Input that is not a mapping at all
        (None, a list, a string, ...) is reported as invalid rather than
        raising.

    Validates:
        - "type" field exists and is valid IRType
        - call_matcher: non-empty "patterns" list and boolean "wildcard"
        - variable_matcher: non-empty "pattern" string and boolean "wildcard"
        - dataflow: non-empty "sources" and "sinks" lists, "sanitizers" and
          "propagation" lists, "scope" equal to "local" or "global"
        - logic_and / logic_or: non-empty "matchers" list whose entries are
          themselves valid IR nodes
        - logic_not: "matcher" entry that is itself a valid IR node
    """
    from collections.abc import Mapping

    # A validator must answer False for garbage, not crash on it.
    if not isinstance(ir, Mapping) or "type" not in ir:
        return False

    try:
        ir_type = IRType(ir["type"])
    except ValueError:
        return False

    def is_list(key: str, non_empty: bool = False) -> bool:
        # A missing key yields None, which fails the isinstance check.
        value = ir.get(key)
        if not isinstance(value, list):
            return False
        return len(value) > 0 or not non_empty

    def has_bool_wildcard() -> bool:
        return isinstance(ir.get("wildcard"), bool)

    # Type-specific validation
    if ir_type == IRType.CALL_MATCHER:
        return is_list("patterns", non_empty=True) and has_bool_wildcard()

    if ir_type == IRType.VARIABLE_MATCHER:
        pattern = ir.get("pattern")
        return (
            isinstance(pattern, str)
            and len(pattern) > 0
            and has_bool_wildcard()
        )

    if ir_type == IRType.DATAFLOW:
        return (
            is_list("sources", non_empty=True)
            and is_list("sinks", non_empty=True)
            and is_list("sanitizers")
            and is_list("propagation")
            # Tuple, not set: an unhashable "scope" must compare False, not raise.
            and ir.get("scope") in ("local", "global")
        )

    if ir_type in (IRType.LOGIC_AND, IRType.LOGIC_OR):
        # Combinators carry their operands under "matchers"; each operand is
        # a nested IR node and is validated recursively.
        return is_list("matchers", non_empty=True) and all(
            validate_ir(operand) for operand in ir["matchers"]
        )

    if ir_type == IRType.LOGIC_NOT:
        # A missing "matcher" yields None, which the recursive call rejects.
        return validate_ir(ir.get("matcher"))

    return True
@@ -0,0 +1,101 @@
1
+ """Logic operators for combining matchers."""
2
+
3
+ from typing import Union
4
+ from .matchers import CallMatcher, VariableMatcher
5
+ from .dataflow import DataflowMatcher
6
+ from .ir import IRType
7
+
8
# Type of anything accepted as an operand by the logic operators in this
# module. The three operator classes are given as strings (forward
# references) so the alias can mention them without needing the class objects.
MatcherType = Union[
    CallMatcher,
    VariableMatcher,
    DataflowMatcher,
    "AndOperator",
    "OrOperator",
    "NotOperator",
]
16
+
17
+
18
class AndOperator:
    """
    Logical AND - every operand matcher must match.

    Example:
        And(calls("eval"), variable("user_input"))
        # Matches code that has BOTH eval calls AND user_input variable
    """

    def __init__(self, *matchers: MatcherType):
        """
        Args:
            *matchers: Two or more matchers to combine.

        Raises:
            ValueError: If fewer than two matchers are given.
        """
        operands = list(matchers)
        if len(operands) < 2:
            raise ValueError("And() requires at least 2 matchers")
        self.matchers = operands

    def to_ir(self) -> dict:
        """Serialize to a ``logic_and`` IR node holding each operand's IR."""
        node = {"type": IRType.LOGIC_AND.value}
        children = []
        for operand in self.matchers:
            children.append(operand.to_ir())
        node["matchers"] = children
        return node

    def __repr__(self) -> str:
        count = len(self.matchers)
        return f"And({count} matchers)"
40
+
41
+
42
class OrOperator:
    """
    Logical OR - one matching operand is enough.

    Example:
        Or(calls("eval"), calls("exec"))
        # Matches code with eval OR exec
    """

    def __init__(self, *matchers: MatcherType):
        """
        Args:
            *matchers: Two or more alternative matchers.

        Raises:
            ValueError: If fewer than two matchers are given.
        """
        alternatives = list(matchers)
        if len(alternatives) < 2:
            raise ValueError("Or() requires at least 2 matchers")
        self.matchers = alternatives

    def to_ir(self) -> dict:
        """Serialize to a ``logic_or`` IR node holding each operand's IR."""
        node = {"type": IRType.LOGIC_OR.value}
        children = []
        for alternative in self.matchers:
            children.append(alternative.to_ir())
        node["matchers"] = children
        return node

    def __repr__(self) -> str:
        count = len(self.matchers)
        return f"Or({count} matchers)"
64
+
65
+
66
class NotOperator:
    """
    Logical NOT - the wrapped matcher must NOT match.

    Example:
        Not(calls("test_*"))
        # Matches code that does NOT call test_* functions
    """

    def __init__(self, matcher: MatcherType):
        """
        Args:
            matcher: The single matcher to negate.
        """
        self.matcher = matcher

    def to_ir(self) -> dict:
        """Serialize to a ``logic_not`` IR node holding the operand's IR."""
        node = {"type": IRType.LOGIC_NOT.value}
        node["matcher"] = self.matcher.to_ir()
        return node

    def __repr__(self) -> str:
        return f"Not({self.matcher!r})"
86
+
87
+
88
+ # Public API
89
def And(*matchers: MatcherType) -> AndOperator:
    """Build an AND combinator over the given matchers.

    Raises:
        ValueError: If fewer than two matchers are given.
    """
    combinator = AndOperator(*matchers)
    return combinator
92
+
93
+
94
def Or(*matchers: MatcherType) -> OrOperator:
    """Build an OR combinator over the given matchers.

    Raises:
        ValueError: If fewer than two matchers are given.
    """
    combinator = OrOperator(*matchers)
    return combinator
97
+
98
+
99
def Not(matcher: MatcherType) -> NotOperator:
    """Build a NOT combinator that negates the given matcher."""
    combinator = NotOperator(matcher)
    return combinator
@@ -0,0 +1,243 @@
1
+ """
2
+ Core matchers for the pathfinder Python DSL.
3
+
4
+ These matchers generate JSON IR for the Go executor.
5
+ """
6
+
7
+ from typing import Dict, Optional, Union, List, Any
8
+ from .ir import IRType
9
+
10
# Value an argument constraint may carry: one scalar (str/int/float/bool) or a
# list of such scalars.
ArgumentValue = Union[str, int, float, bool, List[Union[str, int, float, bool]]]
11
+
12
+
13
class CallMatcher:
    """
    Matches function/method calls with optional argument constraints.

    Examples:
        calls("eval")                                    # Exact match
        calls("eval", "exec")                            # Multiple patterns
        calls("request.*")                               # Wildcard (any request.* call)
        calls("*.json")                                  # Wildcard (any *.json call)
        calls("app.run", match_name={"debug": True})     # Keyword argument matching
        calls("open", match_position={1: "w"})           # Positional argument matching
        calls("socket.bind", match_position={"0[0]": "0.0.0.0"})  # Tuple indexing
        calls("connect", match_position={"0[0]": "192.168.*"})    # Wildcard + tuple
    """

    def __init__(
        self,
        *patterns: str,
        match_position: Optional[Dict[Union[int, str], ArgumentValue]] = None,
        match_name: Optional[Dict[str, ArgumentValue]] = None,
    ):
        """
        Args:
            *patterns: Function names to match. Supports wildcards (*).
            match_position: Match positional arguments by index or tuple index.
                           Keys are ints (``0``) or strings (``"0[0]"``).
                           Examples: {0: "value"}, {1: ["a", "b"]}, {"0[0]": "0.0.0.0"}
            match_name: Match named/keyword arguments {name: value}

        Position indexing:
            - Simple: {0: "value"} matches first argument
            - Tuple: {"0[0]": "value"} matches first element of first argument tuple
            - Wildcard: {"0[0]": "192.168.*"} matches with wildcard pattern

        Raises:
            ValueError: If no patterns provided or pattern is empty
        """
        if not patterns:
            raise ValueError("calls() requires at least one pattern")

        if any(not p or not isinstance(p, str) for p in patterns):
            raise ValueError("All patterns must be non-empty strings")

        self.patterns = list(patterns)
        self.wildcard = any("*" in p for p in patterns)
        # Snapshot the caller's dicts so that later mutation of the objects
        # passed in cannot silently change this matcher.
        self.match_position = dict(match_position) if match_position else {}
        self.match_name = dict(match_name) if match_name else {}

    def _make_constraint(self, value: ArgumentValue) -> Dict[str, Any]:
        """
        Create an argument constraint from a value.

        Automatically detects wildcard characters (``*`` or ``?``) in string
        values, including strings inside a list value.

        Args:
            value: The argument value or list of values

        Returns:
            Dictionary with 'value' and 'wildcard' keys
        """
        # Treat a scalar as a one-element list so both shapes share one check.
        candidates = value if isinstance(value, list) else [value]
        has_wildcard = any(
            isinstance(v, str) and ("*" in v or "?" in v) for v in candidates
        )
        return {"value": value, "wildcard": has_wildcard}

    def _build_constraints(self, args: Dict[Any, ArgumentValue]) -> Dict[Any, Dict[str, Any]]:
        """Map each key of ``args`` to its constraint dict, keys left untouched."""
        constraints = {}
        for key, value in args.items():
            constraint = self._make_constraint(value)
            # Propagate wildcard flag from pattern to argument constraints.
            # NOTE(review): an earlier note in this class described the argument
            # wildcard as independent of the pattern wildcard, which this
            # override contradicts. Behavior is kept as-is; confirm the
            # intended semantics against the Go executor.
            if self.wildcard:
                constraint["wildcard"] = True
            constraints[key] = constraint
        return constraints

    def to_ir(self) -> dict:
        """
        Serialize to JSON IR for Go executor.

        Returns:
            {
                "type": "call_matcher",
                "patterns": ["eval", "exec"],
                "wildcard": false,
                "matchMode": "any",
                "keywordArgs": { "debug": {"value": true, "wildcard": false} },
                "positionalArgs": { "0": {"value": "0.0.0.0", "wildcard": false} }
            }

            "positionalArgs" / "keywordArgs" are present only when the
            corresponding constraints were supplied.
        """
        ir = {
            "type": IRType.CALL_MATCHER.value,
            # Copy so that edits to the returned IR do not leak into the matcher.
            "patterns": list(self.patterns),
            "wildcard": self.wildcard,
            "matchMode": "any",
        }

        # Add positional argument constraints; keys become strings in the IR.
        if self.match_position:
            positional = self._build_constraints(self.match_position)
            ir["positionalArgs"] = {
                str(pos): constraint for pos, constraint in positional.items()
            }

        # Add keyword argument constraints
        if self.match_name:
            ir["keywordArgs"] = self._build_constraints(self.match_name)

        return ir

    def __repr__(self) -> str:
        # Argument constraints are intentionally not shown; only the patterns.
        patterns_str = ", ".join(f'"{p}"' for p in self.patterns)
        return f"calls({patterns_str})"
134
+
135
+
136
class VariableMatcher:
    """
    Matches references to variables by name.

    Examples:
        variable("user_input")     # Exact match
        variable("user_*")         # Wildcard prefix
        variable("*_id")           # Wildcard suffix
    """

    def __init__(self, pattern: str):
        """
        Args:
            pattern: Variable name pattern. Supports wildcards (*).

        Raises:
            ValueError: If pattern is empty or not a string
        """
        if not (pattern and isinstance(pattern, str)):
            raise ValueError("variable() requires a non-empty string pattern")

        self.pattern = pattern
        # The pattern counts as a wildcard as soon as it contains one "*".
        self.wildcard = pattern.find("*") != -1

    def to_ir(self) -> dict:
        """
        Serialize to JSON IR for Go executor.

        Returns:
            {
                "type": "variable_matcher",
                "pattern": "user_input",
                "wildcard": false
            }
        """
        node = {"type": IRType.VARIABLE_MATCHER.value}
        node["pattern"] = self.pattern
        node["wildcard"] = self.wildcard
        return node

    def __repr__(self) -> str:
        return 'variable("{}")'.format(self.pattern)
179
+
180
+
181
+ # Public API
182
def calls(
    *patterns: str,
    match_position: Optional[Dict[Union[int, str], ArgumentValue]] = None,
    match_name: Optional[Dict[str, ArgumentValue]] = None,
) -> CallMatcher:
    """
    Create a matcher for function/method calls with optional argument constraints.

    Args:
        *patterns: Function names to match (supports wildcards)
        match_position: Match positional arguments by index {position: value}.
            A position is an int (``0``) or a tuple-index string (``"0[0]"``).
        match_name: Match named/keyword arguments {name: value}

    Returns:
        CallMatcher instance

    Examples:
        >>> calls("eval")
        calls("eval")

        >>> calls("request.GET", "request.POST")
        calls("request.GET", "request.POST")

        >>> calls("urllib.*")
        calls("urllib.*")

        >>> calls("app.run", match_name={"debug": True})
        calls("app.run")

        >>> calls("socket.bind", match_position={0: "0.0.0.0"})
        calls("socket.bind")

        >>> calls("yaml.load", match_position={1: ["Loader", "UnsafeLoader"]})
        calls("yaml.load")

        >>> calls("chmod", match_position={1: "0o7*"})
        calls("chmod")

        >>> calls("app.run", match_position={0: "localhost"}, match_name={"debug": True})
        calls("app.run")
    """
    return CallMatcher(*patterns, match_position=match_position, match_name=match_name)
224
+
225
+
226
def variable(pattern: str) -> VariableMatcher:
    """
    Build a matcher for variable references.

    Args:
        pattern: Variable name pattern (supports wildcards)

    Returns:
        VariableMatcher instance

    Examples:
        >>> variable("user_input")
        variable("user_input")

        >>> variable("*_id")
        variable("*_id")
    """
    matcher = VariableMatcher(pattern)
    return matcher
@@ -0,0 +1,135 @@
1
+ """
2
+ Propagation presets for common use cases.
3
+
4
+ Presets bundle propagation primitives for convenience.
5
+ """
6
+
7
+ from typing import List
8
+ from .propagation import propagates, PropagationPrimitive
9
+
10
+
11
class PropagationPresets:
    """
    Ready-made bundles of propagation primitives.

    Pass the result of one of these static methods wherever a list of
    propagation primitives is expected, instead of listing primitives by hand.
    """

    @staticmethod
    def minimal() -> List[PropagationPrimitive]:
        """
        Smallest bundle: fastest, but misses the most flows.

        Covers:
        - Variable assignments
        - Function arguments

        Coverage: ~40% of real-world flows
        Performance: Fastest (minimal overhead)
        False negatives: Higher (misses return values, strings)

        Use when:
        - Performance is critical
        - You only care about direct variable flows

        Example:
            flows(
                from_sources=calls("request.GET"),
                to_sinks=calls("eval"),
                propagates_through=PropagationPresets.minimal(),
                scope="local"
            )
        """
        bundle = []
        bundle.append(propagates.assignment())
        bundle.append(propagates.function_args())
        return bundle

    @staticmethod
    def standard() -> List[PropagationPrimitive]:
        """
        Recommended default: a balance of coverage and speed.

        Covers:
        - Phase 1: assignment, function_args, function_returns
        - Phase 2: string_concat, string_format

        Coverage: ~75-80% of real-world flows
        Performance: Good (moderate overhead)
        False negatives: Lower

        Use when:
        - General-purpose taint analysis
        - OWASP Top 10 detection
        - Good balance of coverage and performance

        Example:
            flows(
                from_sources=calls("request.*"),
                to_sinks=calls("execute"),
                propagates_through=PropagationPresets.standard(),
                scope="global"
            )
        """
        bundle = []
        # Phase 1 primitives
        bundle.append(propagates.assignment())
        bundle.append(propagates.function_args())
        bundle.append(propagates.function_returns())
        # Phase 2 primitives
        bundle.append(propagates.string_concat())
        bundle.append(propagates.string_format())
        return bundle

    @staticmethod
    def comprehensive() -> List[PropagationPrimitive]:
        """
        Every MVP primitive (Phase 1 + Phase 2).

        Covers:
        - All standard() primitives

        Coverage: ~80% of real-world flows
        Performance: Moderate
        False negatives: Low

        Use when:
        - Maximum coverage within MVP scope
        - Willing to accept moderate performance overhead

        Example:
            flows(
                from_sources=calls("request.*"),
                to_sinks=calls("eval"),
                propagates_through=PropagationPresets.comprehensive(),
                scope="global"
            )
        """
        # For MVP, comprehensive is simply the standard bundle.
        mvp_bundle = PropagationPresets.standard()
        return mvp_bundle

    @staticmethod
    def exhaustive() -> List[PropagationPrimitive]:
        """
        Every primitive (Phase 1-6, POST-MVP).

        NOTE: For MVP, this is same as comprehensive().
        Post-MVP will include collections, control flow, OOP, advanced.

        Coverage: ~95% of real-world flows (POST-MVP)
        Performance: Slower (comprehensive analysis)
        False negatives: Minimal

        Use when:
        - Maximum security coverage required
        - Performance is not a concern
        - Production-critical code

        Example:
            flows(
                from_sources=calls("request.*"),
                to_sinks=calls("execute"),
                propagates_through=PropagationPresets.exhaustive(),
                scope="global"
            )
        """
        # MVP: delegate to comprehensive().
        # POST-MVP: will include Phase 3-6 primitives.
        full_bundle = PropagationPresets.comprehensive()
        return full_bundle