guarddog 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,15 @@
1
1
  import json
2
2
  import logging
3
+ import os
4
+ import re
5
+ from typing import List
6
+
3
7
  import requests
4
8
  from semantic_version import NpmSpec, Version # type:ignore
5
9
 
6
- from guarddog.utils.config import VERIFY_EXHAUSTIVE_DEPENDENCIES
7
10
  from guarddog.scanners.npm_package_scanner import NPMPackageScanner
8
- from guarddog.scanners.scanner import ProjectScanner
11
+ from guarddog.scanners.scanner import Dependency, DependencyVersion, ProjectScanner
12
+ from guarddog.utils.config import VERIFY_EXHAUSTIVE_DEPENDENCIES
9
13
 
10
14
  log = logging.getLogger("guarddog")
11
15
 
@@ -21,7 +25,7 @@ class NPMRequirementsScanner(ProjectScanner):
21
25
  def __init__(self) -> None:
22
26
  super().__init__(NPMPackageScanner())
23
27
 
24
- def parse_requirements(self, raw_requirements: str) -> dict:
28
+ def parse_requirements(self, raw_requirements: str) -> List[Dependency]:
25
29
  """
26
30
  Parses package.json specification and finds all valid
27
31
  versions of each dependency
@@ -40,8 +44,8 @@ class NPMRequirementsScanner(ProjectScanner):
40
44
  }
41
45
  """
42
46
  package = json.loads(raw_requirements)
43
- dependencies = package["dependencies"] if "dependencies" in package else {}
44
- dev_dependencies = (
47
+ dependencies_attr = package["dependencies"] if "dependencies" in package else {}
48
+ dev_dependencies_attr = (
45
49
  package["devDependencies"] if "devDependencies" in package else {}
46
50
  )
47
51
 
@@ -82,23 +86,63 @@ class NPMRequirementsScanner(ProjectScanner):
82
86
  return versions
83
87
 
84
88
  merged = {} # type: dict[str, set[str]]
85
- for package, selector in list(dependencies.items()) + list(
86
- dev_dependencies.items()
89
+ for package, selector in list(dependencies_attr.items()) + list(
90
+ dev_dependencies_attr.items()
87
91
  ):
88
92
  if package not in merged:
89
93
  merged[package] = set()
90
94
  merged[package].add(selector)
91
95
 
92
- results = {}
96
+ dependencies: List[Dependency] = []
93
97
  for package, all_selectors in merged.items():
94
98
  versions = set() # type: set[str]
95
99
  for selector in all_selectors:
96
100
  versions = versions.union(
97
101
  get_matched_versions(find_all_versions(package), selector)
98
102
  )
103
+
99
104
  if len(versions) == 0:
100
105
  log.error(f"Package/Version {package} not on NPM\n")
101
106
  continue
102
107
 
103
- results[package] = versions
104
- return results
108
+ idx = next(
109
+ iter(
110
+ [
111
+ ix
112
+ for ix, line in enumerate(raw_requirements.splitlines())
113
+ if package in line
114
+ ]
115
+ ),
116
+ 0,
117
+ )
118
+
119
+ dep_versions = list(
120
+ map(
121
+ lambda d: DependencyVersion(version=d, location=idx + 1),
122
+ versions,
123
+ )
124
+ )
125
+
126
+ # find the dep with the same name or create a new one
127
+ dep = next(
128
+ filter(
129
+ lambda d: d.name == package,
130
+ dependencies,
131
+ ),
132
+ None,
133
+ )
134
+ if not dep:
135
+ dep = Dependency(name=package, versions=set())
136
+ dependencies.append(dep)
137
+
138
+ dep.versions.update(dep_versions)
139
+
140
+ return dependencies
141
+
142
+ def find_requirements(self, directory: str) -> list[str]:
143
+ requirement_files = []
144
+ for root, dirs, files in os.walk(directory):
145
+ for name in files:
146
+ if re.match(r"^package\.json$", name, flags=re.IGNORECASE):
147
+ requirement_files.append(os.path.join(root, name))
148
+ return requirement_files
@@ -1,11 +1,14 @@
1
1
  import logging
2
+ import os
2
3
  import re
4
+ from typing import List
5
+
3
6
  import pkg_resources
4
7
  import requests
5
8
  from packaging.specifiers import Specifier, Version
6
9
 
7
10
  from guarddog.scanners.pypi_package_scanner import PypiPackageScanner
8
- from guarddog.scanners.scanner import ProjectScanner
11
+ from guarddog.scanners.scanner import Dependency, DependencyVersion, ProjectScanner
9
12
  from guarddog.utils.config import VERIFY_EXHAUSTIVE_DEPENDENCIES
10
13
 
11
14
  log = logging.getLogger("guarddog")
@@ -37,17 +40,20 @@ class PypiRequirementsScanner(ProjectScanner):
37
40
 
38
41
  for line in requirements:
39
42
  is_requirement = re.match(r"\w", line)
40
- if is_requirement:
41
- if "\\" in line:
42
- line = line.replace("\\", "")
43
43
 
44
- stripped_line = line.strip()
45
- if len(stripped_line) > 0:
46
- sanitized_lines.append(stripped_line)
44
+ if not is_requirement:
45
+ sanitized_lines.append("") # empty line to keep the line number
46
+ continue
47
+
48
+ if "\\" in line:
49
+ line = line.replace("\\", "")
50
+
51
+ stripped_line = line.strip()
52
+ sanitized_lines.append(stripped_line)
47
53
 
48
54
  return sanitized_lines
49
55
 
50
- def parse_requirements(self, raw_requirements: str) -> dict[str, set[str]]:
56
+ def parse_requirements(self, raw_requirements: str) -> List[Dependency]:
51
57
  """
52
58
  Parses requirements.txt specification and finds all valid
53
59
  versions of each dependency
@@ -57,17 +63,10 @@ class PypiRequirementsScanner(ProjectScanner):
57
63
 
58
64
  Returns:
59
65
  List[Dependency]: each dependency with the versions matched for it
60
-
61
- ex.
62
- {
63
- ....
64
- <dependency-name>: [0.0.1, 0.0.2, ...],
65
- ...
66
- }
67
66
  """
68
67
  requirements = raw_requirements.splitlines()
69
68
  sanitized_requirements = self._sanitize_requirements(requirements)
70
- dependencies = {}
69
+ dependencies: List[Dependency] = []
71
70
 
72
71
  def get_matched_versions(versions: set[str], semver_range: str) -> set[str]:
73
72
  """
@@ -77,8 +76,11 @@ class PypiRequirementsScanner(ProjectScanner):
77
76
 
78
77
  # Filters to specified versions
79
78
  try:
80
- spec = Specifier(semver_range)
81
- result = [Version(m) for m in spec.filter(versions)]
79
+ matching_versions = versions
80
+ if semver_range:
81
+ spec = Specifier(semver_range)
82
+ matching_versions = set(spec.filter(versions))
83
+ result = [Version(m) for m in matching_versions]
82
84
  except ValueError:
83
85
  # use it raw
84
86
  return set([semver_range])
@@ -142,8 +144,47 @@ class PypiRequirementsScanner(ProjectScanner):
142
144
  )
143
145
  continue
144
146
 
145
- dependencies[requirement.project_name] = versions
147
+ idx = next(
148
+ iter(
149
+ [
150
+ ix
151
+ for ix, line in enumerate(requirements)
152
+ if str(requirement) in line
153
+ ]
154
+ ),
155
+ 0,
156
+ )
157
+
158
+ dep_versions = list(
159
+ map(
160
+ lambda d: DependencyVersion(version=d, location=idx + 1),
161
+ versions,
162
+ )
163
+ )
164
+
165
+ # find the dep with the same name or create a new one
166
+ dep = next(
167
+ filter(
168
+ lambda d: d.name == requirement.project_name,
169
+ dependencies,
170
+ ),
171
+ None,
172
+ )
173
+ if not dep:
174
+ dep = Dependency(name=requirement.project_name, versions=set())
175
+ dependencies.append(dep)
176
+
177
+ dep.versions.update(dep_versions)
178
+
146
179
  except Exception as e:
147
180
  log.error(f"Received error {str(e)}")
148
181
 
149
182
  return dependencies
183
+
184
+ def find_requirements(self, directory: str) -> list[str]:
185
+ requirement_files = []
186
+ for root, dirs, files in os.walk(directory):
187
+ for name in files:
188
+ if re.match(r"^requirements(-dev)?\.txt$", name, flags=re.IGNORECASE):
189
+ requirement_files.append(os.path.join(root, name))
190
+ return requirement_files
@@ -2,11 +2,12 @@ import concurrent.futures
2
2
  import json
3
3
  import logging
4
4
  import os
5
- import sys
6
5
  import tempfile
7
6
  import typing
8
7
  from abc import abstractmethod
9
8
  from concurrent.futures import ThreadPoolExecutor
9
+ from dataclasses import dataclass
10
+ from typing import List, Optional, Set, Tuple
10
11
 
11
12
  import requests
12
13
 
@@ -21,183 +22,65 @@ def noop(arg: typing.Any) -> None:
21
22
  pass
22
23
 
23
24
 
24
- class ProjectScanner:
25
- def __init__(self, package_scanner):
26
- super().__init__()
27
- self.package_scanner = package_scanner
28
-
29
- def _authenticate_by_access_token(self) -> tuple[str, str]:
30
- """
31
- Gives GitHub authentication through access token
32
-
33
- Returns:
34
- tuple[str, str]: username, personal access token
35
- """
36
-
37
- user = os.getenv("GIT_USERNAME")
38
- personal_access_token = os.getenv("GH_TOKEN")
39
- if not user or not personal_access_token:
40
- log.error(
41
- """WARNING: Please set GIT_USERNAME (Github handle) and GH_TOKEN
42
- (generate a personal access token in Github settings > developer)
43
- as environment variables before proceeding."""
44
- )
45
- exit(1)
46
- return (user, personal_access_token)
47
-
48
- def scan_requirements(
49
- self,
50
- requirements: str,
51
- rules=None,
52
- callback: typing.Callable[[dict], None] = noop,
53
- ) -> dict:
54
- """
55
- Reads the requirements.txt file and scans each possible
56
- dependency and version
57
-
58
- Args:
59
- requirements (str): contents of requirements.txt file
60
- rules: list of rules to apply
61
- callback: callback to call for each result
62
-
63
- Returns:
64
- dict: mapping of dependencies to scan results
65
-
66
- ex.
67
- {
68
- ....
69
- <dependency-name>: {
70
- issues: ...,
71
- results: {
72
- ...
73
- }
74
- },
75
- ...
76
- }
77
- """
78
-
79
- def scan_single_dependency(dependency, version):
80
- log.debug(f"Scanning {dependency} version {version}")
81
- result = self.package_scanner.scan_remote(dependency, version, rules)
82
- return {"dependency": dependency, "version": version, "result": result}
83
-
84
- dependencies = self.parse_requirements(requirements)
85
- num_workers = PARALLELISM
86
-
87
- log.info(
88
- f"Scanning using at most {num_workers} parallel worker threads\n"
89
- )
90
- with ThreadPoolExecutor(max_workers=num_workers) as pool:
91
- try:
92
- futures: typing.List[concurrent.futures.Future] = []
93
- for dependency, versions in dependencies.items():
94
- assert versions is None or len(versions) > 0
95
- if versions is None:
96
- # this will cause scan_remote to use the latest version
97
- futures.append(
98
- pool.submit(scan_single_dependency, dependency, None)
99
- )
100
- else:
101
- futures.extend(
102
- map(
103
- lambda version: pool.submit(
104
- scan_single_dependency, dependency, version
105
- ),
106
- versions,
107
- )
108
- )
109
-
110
- results = []
111
- for future in concurrent.futures.as_completed(futures):
112
- result = future.result()
113
- if callback is not None:
114
- callback(result)
115
- results.append(result)
116
- except KeyboardInterrupt:
117
- log.warning("Received keyboard interrupt, cancelling scan\n")
118
- pool.shutdown(wait=False, cancel_futures=True)
119
-
120
- return results # type: ignore
25
+ @dataclass
26
+ class DependencyVersion:
27
+ """
28
+ This class represents the identified dependency versions in a project,
29
+ usually defined in a specification file (requirements.txt, package.json, etc.)
121
30
 
122
- def scan_remote(self, url: str, branch: str, requirements_name: str) -> dict:
123
- """
124
- Scans remote requirements.txt file
31
+ Attributes:
32
+ version (str): The version of the dependency. e.g., "1.0.0"
33
+ location (int): This indicates the line number in the specification file where the dependency is defined.
34
+ """
125
35
 
126
- Args:
127
- url (str): url of the GitHub repo
128
- branch (str): branch containing requirements.txt
129
- requirements_name (str, optional): name of requirements file.
130
- Defaults to "requirements.txt".
36
+ version: str # the version number of the dependency
37
+ location: int
131
38
 
132
- Returns:
133
- dict: mapping of dependencies to scan results
39
+ def __eq__(self, other):
40
+ if isinstance(other, str):
41
+ return self.version == other
42
+ if isinstance(other, DependencyVersion):
43
+ return self.version == other.version
44
+ return NotImplemented
134
45
 
135
- ex.
136
- {
137
- ....
138
- <dependency-name>: {
139
- issues: ...,
140
- results: {
141
- ...
142
- }
143
- },
144
- ...
145
- }
146
- """
46
+ def __hash__(self):
47
+ return hash(self.version)
147
48
 
148
- token = self._authenticate_by_access_token()
149
- githubusercontent_url = url.replace("github", "raw.githubusercontent")
49
+ def __repr__(self):
50
+ return f"DependencyVersion({self.version!r})"
150
51
 
151
- req_url = f"{githubusercontent_url}/{branch}/{requirements_name}"
152
- resp = requests.get(url=req_url, auth=token)
153
52
 
154
- if resp.status_code == 200:
155
- return self.scan_requirements(resp.content.decode())
156
- else:
157
- log.error(
158
- f"{req_url} does not exist. Check your link or branch name."
159
- )
160
- sys.exit(255)
53
+ @dataclass
54
+ class Dependency:
55
+ """
56
+ This class represents a dependency in a project, usually defined in a specification file
161
57
 
162
- def scan_local(
163
- self, path, rules=None, callback: typing.Callable[[dict], None] = noop
164
- ):
165
- """
166
- Scans a local requirements.txt file
58
+ Attributes:
59
+ name (str): The name of the dependency. e.g., "requests"
60
+ versions (Set[DependencyVersion]): A set of identified versions of the dependency.
61
+ """
62
+ name: str
63
+ versions: Set[DependencyVersion]
167
64
 
168
- Args:
169
- path (str): path to requirements.txt file
170
- rules: list of rules to apply
171
- callback: callback to call for each result
65
+ def __eq__(self, other):
66
+ if isinstance(other, str):
67
+ return self.name == other
68
+ if isinstance(other, Dependency):
69
+ return self.name == other.name
70
+ return NotImplemented
172
71
 
173
- Returns:
174
- dict: mapping of dependencies to scan results
72
+ def __repr__(self):
73
+ return f"Dependency({self.name!r})"
175
74
 
176
- ex.
177
- {
178
- ....
179
- <dependency-name>: {
180
- issues: ...,
181
- results: {
182
- ...
183
- }
184
- },
185
- ...
186
- }
187
- """
188
75
 
189
- try:
190
- with open(path, "r") as f:
191
- return self.scan_requirements(f.read(), rules, callback)
192
- except Exception as e:
193
- log.error(f"Received {e}")
194
- sys.exit(255)
76
+ @dataclass
77
+ class DependencyFile:
78
+ """
79
+ This class represents a specification file for a project (requirements.txt, package.json, etc.)
80
+ """
195
81
 
196
- @abstractmethod
197
- def parse_requirements(
198
- self, raw_requirements: str
199
- ) -> dict[str, set[str]]: # returns { package: version }
200
- pass
82
+ file_path: str
83
+ dependencies: List[Dependency]
201
84
 
202
85
 
203
86
  class PackageScanner:
@@ -324,3 +207,202 @@ class PackageScanner:
324
207
  finally:
325
208
  log.debug(f"Removing temporary archive file {archive_path}")
326
209
  os.remove(archive_path)
210
+
211
+
212
+ class ProjectScanner:
213
+ def __init__(self, package_scanner: PackageScanner):
214
+ super().__init__()
215
+ self.package_scanner = package_scanner
216
+
217
+ def _authenticate_by_access_token(self) -> tuple[str, str]:
218
+ """
219
+ Gives GitHub authentication through access token
220
+
221
+ Returns:
222
+ tuple[str, str]: username, personal access token
223
+ """
224
+
225
+ user = os.getenv("GIT_USERNAME")
226
+ personal_access_token = os.getenv("GH_TOKEN")
227
+ if not user or not personal_access_token:
228
+ log.error(
229
+ """WARNING: Please set GIT_USERNAME (Github handle) and GH_TOKEN
230
+ (generate a personal access token in Github settings > developer)
231
+ as environment variables before proceeding."""
232
+ )
233
+ exit(1)
234
+ return (user, personal_access_token)
235
+
236
+ def scan_dependencies(
237
+ self,
238
+ dependencies: List[Dependency],
239
+ rules=None,
240
+ callback: typing.Callable[[dict], None] = noop,
241
+ ) -> list[dict]:
242
+ """
243
+ Scans each possible dependency and version supplied
244
+
245
+ Args:
246
+ dependencies: list of dependencies to scan
247
+ rules: list of rules to apply
248
+ callback: callback to call for each result
249
+
250
+ Returns:
251
+ list[dict]: one entry per scanned version, with keys "dependency", "version" and "result"
252
+
253
+ ex.
254
+ {
255
+ ....
256
+ <dependency-name>: {
257
+ issues: ...,
258
+ results: {
259
+ ...
260
+ }
261
+ },
262
+ ...
263
+ }
264
+ """
265
+
266
+ def scan_single_dependency(dependency: str, version: Optional[str]) -> dict:
267
+ log.debug(f"Scanning {dependency} version {version}")
268
+ result = self.package_scanner.scan_remote(dependency, version, rules)
269
+ return {"dependency": dependency, "version": version, "result": result}
270
+
271
+ num_workers = PARALLELISM
272
+
273
+ log.info(f"Scanning using at most {num_workers} parallel worker threads\n")
274
+ with ThreadPoolExecutor(max_workers=num_workers) as pool:
275
+ try:
276
+ futures: typing.List[concurrent.futures.Future] = []
277
+ for dependency in dependencies:
278
+ versions = dependency.versions
279
+ if not versions:
280
+ # this will cause scan_remote to use the latest version
281
+ futures.append(
282
+ pool.submit(scan_single_dependency, dependency.name, None)
283
+ )
284
+ else:
285
+ futures.extend(
286
+ map(
287
+ lambda version: pool.submit(
288
+ scan_single_dependency,
289
+ dependency.name,
290
+ version.version,
291
+ ),
292
+ versions,
293
+ )
294
+ )
295
+
296
+ results = []
297
+ for future in concurrent.futures.as_completed(futures):
298
+ result = future.result()
299
+ if callback is not None:
300
+ callback(result)
301
+ results.append(result)
302
+ except KeyboardInterrupt:
303
+ log.warning("Received keyboard interrupt, cancelling scan\n")
304
+ pool.shutdown(wait=False, cancel_futures=True)
305
+
306
+ return results
307
+
308
+ def scan_remote(
309
+ self, url: str, branch: str, requirements_name: str
310
+ ) -> tuple[List[Dependency], list[dict]]:
311
+ """
312
+ Scans a remote requirements file hosted in a GitHub repository
313
+
314
+ Args:
315
+ url (str): url of the GitHub repo
316
+ branch (str): branch containing requirements.txt
317
+ requirements_name (str): path of the requirements file
318
+ within the branch, e.g. "requirements.txt"
319
+
320
+ Returns:
321
+ deps: list of dependencies to scan
322
+ results: list of scan results, as returned by scan_dependencies
323
+ ex.
324
+ {
325
+ ....
326
+ <dependency-name>: {
327
+ issues: ...,
328
+ results: {
329
+ ...
330
+ }
331
+ },
332
+ ...
333
+ }
334
+ """
335
+
336
+ token = self._authenticate_by_access_token()
337
+ githubusercontent_url = url.replace("github", "raw.githubusercontent")
338
+ req_url = f"{githubusercontent_url}/{branch}/{requirements_name}"
339
+ resp = requests.get(url=req_url, auth=token)
340
+ resp.raise_for_status()
341
+ dependencies = self.parse_requirements(resp.content.decode())
342
+ return dependencies, self.scan_dependencies(dependencies)
343
+
344
+ def scan_local(
345
+ self, path, rules=None, callback: typing.Callable[[dict], None] = noop
346
+ ) -> Tuple[List[DependencyFile], list[dict]]:
347
+ """
348
+ Scans a local requirements file, or every requirements file found under a directory (requirements.txt, package.json, etc.)
349
+
350
+ Args:
351
+ path (str): path to requirements file or directory to search
352
+ rules: list of rules to apply
353
+ callback: callback to call for each result
354
+
355
+ Returns:
356
+ deps: list of dependencies to scan
357
+ results: list of scan results, as returned by scan_dependencies
358
+ ex.
359
+ {
360
+ ....
361
+ <dependency-name>: {
362
+ issues: ...,
363
+ results: {
364
+ ...
365
+ }
366
+ },
367
+ ...
368
+ }
369
+
370
+ """
371
+
372
+ requirement_paths = []
373
+
374
+ try:
375
+ if os.path.isfile(path):
376
+ requirement_paths.append(path)
377
+ elif os.path.isdir(path):
378
+ requirement_paths.extend(self.find_requirements(path))
379
+ else:
380
+ raise ValueError(f"unable to find file or directory {path}")
381
+
382
+ dep_files: List[DependencyFile] = []
383
+
384
+ for req in requirement_paths:
385
+ with open(req, "r") as f:
386
+ dep_files.append(
387
+ DependencyFile(
388
+ file_path=req,
389
+ dependencies=self.parse_requirements(f.read()),
390
+ )
391
+ )
392
+ deps_to_scan = [d for d_file in dep_files for d in d_file.dependencies]
393
+ results = self.scan_dependencies(deps_to_scan, rules, callback)
394
+ return dep_files, results
395
+ except Exception as e:
396
+ log.error(f"Error while scanning. Received {e}")
397
+ raise e
398
+
399
+ @abstractmethod
400
+ def parse_requirements(self, raw_requirements: str) -> List[Dependency]:
401
+ pass
402
+
403
+ @abstractmethod
404
+ def find_requirements(
405
+ self,
406
+ directory: str,
407
+ ) -> list[str]: # returns paths of files
408
+ pass