better-git-of-theseus 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,643 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright 2016 Erik Bernhardsson
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import argparse
18
+ import datetime
19
+ import functools
20
+ import json
21
+ import multiprocessing
22
+ import os
23
+ import signal
24
+ import warnings
25
+ import threading
26
+ from pathlib import Path
27
+
28
+ import git
29
+ import pygments.lexers
30
+ from tqdm import tqdm
31
+ from wcmatch import fnmatch
32
+
33
def safe_signal(signum, handler):
    """Install *handler* for signal *signum*, tolerating non-main threads.

    `signal.signal` raises ValueError when invoked outside the main thread;
    in that case this helper silently does nothing.
    """
    try:
        signal.signal(signum, handler)
    except ValueError:
        # Signal handlers can only be set from the main thread — ignore elsewhere.
        pass
39
+
40
# Some filetypes in Pygments are not necessarily computer code, but
# configuration/documentation. Let's not include those.
IGNORE_PYGMENTS_FILETYPES = [
    "*.json",
    "*.md",
    "*.ps",
    "*.eps",
    "*.txt",
    "*.xml",
    "*.xsl",
    "*.rss",
    "*.xslt",
    "*.xsd",
    "*.wsdl",
    "*.wsf",
    "*.yaml",
    "*.yml",
]

# Every filename glob Pygments knows a lexer for, minus the ignored ones.
default_filetypes = {
    pattern
    for _, _, patterns, _ in pygments.lexers.get_all_lexers()
    for pattern in patterns
}
default_filetypes.difference_update(IGNORE_PYGMENTS_FILETYPES)
62
+
63
+
64
class MiniEntry:
    """Lightweight copy of a git tree entry holding only `path` and `binsha`,
    detached from the underlying repository objects."""

    def __init__(self, entry):
        self.path, self.binsha = entry.path, entry.binsha
68
+
69
+
70
class MiniCommit:
    """Lightweight copy of a commit holding only `hexsha` and `committed_date`,
    detached from the underlying repository objects."""

    def __init__(self, commit):
        self.hexsha, self.committed_date = commit.hexsha, commit.committed_date
74
+
75
+
76
def get_top_dir(path):
    """Return the top-level directory of *path* with a trailing '/'.

    For a path with no directory component the result is just "/".
    Git/GitPython also returns '/'-separated paths on Windows, so the
    split is always on '/'.
    """
    parent = os.path.dirname(path)
    return parent.split("/", 1)[0] + "/"
80
+
81
+
82
class BlameProc(multiprocessing.Process):
    """Worker process that computes `git blame` histograms for queued files.

    The parent pushes `(path, commit_hexsha)` items onto `q`; for each, the
    worker blames the file and pushes `(path, histogram)` onto `ret_q`.
    A falsy commit value (the `(None, None)` sentinel) terminates the worker.
    `run_flag` is a shared Event the parent clears to pause workers.
    """

    def __init__(
        self, repo_dir, q, ret_q, run_flag, blame_kwargs, commit2cohort, use_mailmap
    ):
        super().__init__(daemon=True)
        self.repo = git.Repo(repo_dir)  # each worker opens its own repo handle
        self.q = q  # work queue: (path, commit hexsha)
        self.ret_q = ret_q  # result queue: (path, histogram dict)
        self.run_flag = run_flag  # cleared by the parent to pause workers
        self.blame_kwargs = dict(blame_kwargs)
        # On Unix systems, if the process is started via the `fork` method, this
        # could be made a copy-on-write shared variable to save RAM.
        self.commit2cohort = commit2cohort
        self.use_mailmap = use_mailmap

    def get_file_histogram(self, path, commit):
        """Blame `path` at `commit` and return a {key_tuple: line_count} dict.

        Keys produced: ("cohort", ...), ("ext", ...), ("author", ...),
        ("dir", ...), ("domain", ...) and — for commits present in
        `commit2cohort` — ("sha", hexsha).  Any blame failure yields an
        empty histogram (best effort).
        """
        h = {}
        # The extension depends only on `path`, so compute it once, not per hunk.
        _, ext = os.path.splitext(path)
        try:
            for old_commit, lines in self.repo.blame(commit, path, **self.blame_kwargs):
                cohort = self.commit2cohort.get(old_commit.binsha, "MISSING")
                if self.use_mailmap:
                    author_name, author_email = get_mailmap_author_name_email(
                        self.repo, old_commit.author.name, old_commit.author.email
                    )
                else:
                    author_name, author_email = (
                        old_commit.author.name,
                        old_commit.author.email,
                    )
                keys = [
                    ("cohort", cohort),
                    ("ext", ext),
                    ("author", author_name),
                    ("dir", get_top_dir(path)),
                    ("domain", author_email.split("@")[-1]),
                ]

                if old_commit.binsha in self.commit2cohort:
                    keys.append(("sha", old_commit.hexsha))

                for key in keys:
                    h[key] = h.get(key, 0) + len(lines)
        except Exception:
            # Was a bare `except:`: blame is best-effort (a file git cannot
            # blame contributes nothing), but KeyboardInterrupt/SystemExit
            # should not be swallowed here.
            pass
        return h

    def run(self):
        # Workers ignore SIGINT; shutdown is coordinated by the parent via
        # the (None, None) sentinel on the work queue.
        safe_signal(signal.SIGINT, signal.SIG_IGN)
        while self.run_flag.wait():
            entry, commit = self.q.get()
            if not commit:  # sentinel: exit the worker
                return
            self.ret_q.put((entry, self.get_file_histogram(entry, commit)))
138
+
139
+
140
class BlameDriver:
    """Coordinates a pool of BlameProc workers over shared work/result queues.

    `last_file_y` and `cur_y` are caller-owned dicts mutated in place, so the
    caller observes per-file contributions and running totals directly.
    """

    def __init__(
        self,
        repo_dir,
        proc_count,
        last_file_y,
        cur_y,
        blame_kwargs,
        commit2cohort,
        use_mailmap,
        quiet,
    ):
        self.repo_dir = repo_dir
        self.proc_count = proc_count  # desired pool size; may change at runtime
        self.q = multiprocessing.Queue()  # work items: (path, commit hexsha)
        self.ret_q = multiprocessing.Queue()  # results: (path, histogram)
        self.run_flag = multiprocessing.Event()  # cleared to pause workers
        self.run_flag.set()
        self.last_file_y = last_file_y
        self.cur_y = cur_y
        self.blame_kwargs = blame_kwargs
        self.commit2cohort = commit2cohort
        self.use_mailmap = use_mailmap
        self.quiet = quiet
        self.proc_pool = []
        # Fix: this previously called spawn_process(self.proc_count), passing
        # the process count into the boolean `spawn_only` flag. It only worked
        # because the flag is ignored on the grow path; call it properly.
        self.spawn_process()

    def spawn_process(self, spawn_only=False):
        """Grow the pool up to `self.proc_count`; if the pool is too large,
        shrink it instead (unless `spawn_only` is set)."""
        n = self.proc_count - len(self.proc_pool)
        if n == 0:
            return
        if n < 0:
            return None if spawn_only else self._despawn_process(-n)
        if not self.quiet:
            print("\n\nStarting up processes: ", end="")
        for i in range(n):
            self.proc_pool.append(
                BlameProc(
                    self.repo_dir,
                    self.q,
                    self.ret_q,
                    self.run_flag,
                    self.blame_kwargs,
                    self.commit2cohort,
                    self.use_mailmap,
                )
            )
            self.proc_pool[-1].start()
            if not self.quiet:
                print(
                    ("" if i == 0 else ", ") + self.proc_pool[-1].name,
                    end="\n" if i == n - 1 else "",
                )

    def _despawn_process(self, n):
        """Shut down `n` workers by queueing sentinel items, then reap them."""
        for i in range(n):
            self.q.put((None, None))

        print("\n")
        # NOTE(review): busy-wait that redraws the status line until `n`
        # workers have exited; relies on workers draining the sentinels
        # promptly.
        while True:
            print("\rShutting down processes: ", end="")
            killed_processes = 0
            for idx, proc in enumerate(self.proc_pool):
                if proc.is_alive():
                    continue
                else:
                    print(
                        ("" if killed_processes == 0 else ", ") + proc.name,
                        end="\n" if killed_processes == n - 1 else "",
                    )
                    killed_processes += 1
            if killed_processes >= n:
                for proc in self.proc_pool:
                    if not proc.is_alive():
                        proc.join()
                self.proc_pool = [proc for proc in self.proc_pool if proc.is_alive()]
                return

    def fetch(self, commit, check_entries, bar):
        """Blame every entry in `check_entries` at `commit`.

        Updates the shared `cur_y` totals and the per-file `last_file_y`
        contributions, advancing the tqdm `bar` once per processed entry.
        Returns `cur_y`.
        """
        self.spawn_process()
        processed_entries = 0
        total_entries = len(check_entries)

        for entry in check_entries:
            self.q.put((entry.path, commit.hexsha))

        while processed_entries < total_entries:
            path, file_y = self.ret_q.get()

            for key_tuple, file_locs in file_y.items():
                self.cur_y[key_tuple] = self.cur_y.get(key_tuple, 0) + file_locs
            self.last_file_y[path] = file_y

            processed_entries += 1
            self.run_flag.wait()  # block here while the driver is paused
            bar.update()

        return self.cur_y

    def pause(self):
        # Workers finish their current item, then block on run_flag.
        self.run_flag.clear()

    def resume(self):
        self.run_flag.set()
244
+
245
+
246
def analyze(
    repo_dir,
    cohortfm="%Y",
    interval=7 * 24 * 60 * 60,
    ignore=[],
    only=[],
    outdir=".",
    branch="master",
    all_filetypes=False,
    ignore_whitespace=False,
    procs=2,
    quiet=False,
    opt=False,
):
    """Analyze a git repository's line-survival history.

    Walks `branch` backwards, samples commits at least `interval` seconds
    apart, blames every matching file at each sampled commit (using `procs`
    worker processes), and accumulates per-cohort / per-extension /
    per-author / per-directory / per-domain line counts over time.

    Args:
        repo_dir: Path to the git repository.
        cohortfm: strftime format used to bucket commits into cohorts.
        interval: Minimum seconds between sampled commits.
        ignore: Glob patterns of paths to exclude.  NOTE(review): mutable
            default — harmless here since it is only read/rebound, never
            mutated.
        only: Glob patterns; if non-empty, paths must match at least one.
        outdir: Directory where JSON results are written (created if missing);
            falsy to skip writing.
        branch: Branch to track; falls back to the active branch if missing.
        all_filetypes: If True, skip the Pygments filetype filter.
        ignore_whitespace: Pass `-w` to git blame.
        procs: Number of blame worker processes.
        quiet: Suppress console output and progress bars.
        opt: Pre-generate git's commit-graph to speed up subsequent git calls.

    Returns:
        Dict with keys "cohorts", "exts", "authors", "dirs", "domains"
        (each {"y", "ts", "labels"}) and "survival" (per-SHA line counts
        over time).
    """
    # A repo-level .mailmap enables author canonicalization via check-mailmap.
    use_mailmap = (Path(repo_dir) / ".mailmap").exists()
    repo = git.Repo(repo_dir)
    blame_kwargs = {}
    if ignore_whitespace:
        blame_kwargs["w"] = True
    master_commits = []  # only stores a subset (sampled at `interval` spacing)
    commit2cohort = {}  # commit binsha -> cohort label
    curve_key_tuples = set()  # Keys of each curve that will be tracked
    tqdm_args = {
        "smoothing": 0.025,  # Exponential smoothing is still rather jumpy, a tiny number will do
        "disable": quiet,
        "dynamic_ncols": True,
    }

    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # Check if specified branch exists
    try:
        repo.git.show_ref("refs/heads/{:s}".format(branch), verify=True)
    except git.exc.GitCommandError:
        default_branch = repo.active_branch.name
        warnings.warn(
            "Requested branch: '{:s}' does not exist. Falling back to default branch: '{:s}'".format(
                branch, default_branch
            )
        )

        branch = default_branch

    if not quiet and repo.git.version_info < (2, 31, 0):
        print(
            "Old Git version {:d}.{:d}.{:d} detected. There are optimizations available in version 2.31.0 which speed up performance".format(
                *repo.git.version_info
            )
        )

    if opt:
        if not quiet:
            print(
                "Generating git commit-graph... If you wish, this file is deletable later at .git/objects/info"
            )
        repo.git.execute(
            ["git", "commit-graph", "write", "--changed-paths"]
        )  # repo.git.commit_graph('write --changed-paths') doesn't work for some reason

    # Pass 1: map every commit on the branch to its cohort and pre-register
    # the cohort/author/domain curves.
    desc = "{:<55s}".format("Listing all commits")
    for commit in tqdm(
        repo.iter_commits(branch), desc=desc, unit=" Commits", **tqdm_args
    ):
        cohort = datetime.datetime.utcfromtimestamp(commit.committed_date).strftime(
            cohortfm
        )
        commit2cohort[commit.binsha] = cohort
        curve_key_tuples.add(("cohort", cohort))
        if use_mailmap:
            author_name, author_email = get_mailmap_author_name_email(
                repo, commit.author.name, commit.author.email
            )
        else:
            author_name, author_email = commit.author.name, commit.author.email
        curve_key_tuples.add(("author", author_name))
        curve_key_tuples.add(("domain", author_email.split("@")[-1]))

    # Pass 2: follow first-parent links from HEAD, keeping commits at least
    # `interval` seconds apart.
    desc = "{:<55s}".format("Backtracking the master branch")
    with tqdm(desc=desc, unit=" Commits", **tqdm_args) as bar:
        commit = repo.head.commit
        last_date = None
        while True:
            if last_date is None or commit.committed_date < last_date - interval:
                master_commits.append(commit)
                last_date = commit.committed_date
                bar.update()
            if not commit.parents:
                break
            commit = commit.parents[0]
        del commit

    if ignore and not only:
        only = ["**"]  # stupid fix
    # wcmatch extended-glob patterns: files must match a default filetype
    # (unless all_filetypes) AND the only/ignore expression.
    def_ft_str = "+({:s})".format("|".join(default_filetypes))
    path_match_str = "{:s}|!+({:s})".format("|".join(only), "|".join(ignore))
    path_match_zero = len(only) == 0 and len(ignore) == 0
    ok_entry_paths = dict()
    all_entries = []

    def entry_path_ok(path):
        # All this matching is slow so let's cache it
        if path not in ok_entry_paths:
            ok_entry_paths[path] = (
                all_filetypes
                or fnmatch.fnmatch(
                    os.path.split(path)[-1], def_ft_str, flags=fnmatch.EXTMATCH
                )
            ) and (
                path_match_zero
                or fnmatch.fnmatch(
                    path,
                    path_match_str,
                    flags=fnmatch.NEGATE | fnmatch.EXTMATCH | fnmatch.SPLIT,
                )
            )
        return ok_entry_paths[path]

    def get_entries(commit):
        # Collect the matching blobs of `commit` and remember them in
        # `all_entries` for the blame pass below.
        tmp = [
            MiniEntry(entry)
            for entry in commit.tree.traverse()
            if entry.type == "blob" and entry_path_ok(entry.path)
        ]
        all_entries.append(tmp)
        return tmp

    master_commits = master_commits[::-1]  # Reverse it so it's chronological ascending
    entries_total = 0
    desc = "{:<55s}".format("Discovering entries & caching filenames")
    with tqdm(
        desc="{:<55s}".format("Entries Discovered"),
        unit=" Entries",
        position=1,
        **tqdm_args,
    ) as bar:
        for i, commit in enumerate(
            tqdm(master_commits, desc=desc, unit=" Commits", position=0, **tqdm_args)
        ):
            for entry in get_entries(commit):
                entries_total += 1
                _, ext = os.path.splitext(entry.path)
                curve_key_tuples.add(("ext", ext))
                curve_key_tuples.add(("dir", get_top_dir(entry.path)))
                bar.update()
            master_commits[i] = MiniCommit(
                commit
            )  # Might have cached the entries, we don't want that

    # We don't need these anymore, let GC Cleanup
    del repo
    del ok_entry_paths
    del commit
    # End GC Cleanup

    curves = {}  # multiple y axis, in the form key_tuple: Array[y-axis points]
    ts = []  # x axis
    last_file_y = (
        {}
    )  # Contributions of each individual file to each individual curve, when the file was last seen
    cur_y = {}  # Sum of all contributions between files towards each individual curve
    blamer = BlameDriver(
        repo_dir,
        procs,
        last_file_y,
        cur_y,
        blame_kwargs,
        commit2cohort,
        use_mailmap,
        quiet,
    )
    commit_history = (
        {}
    )  # How many lines of a commit (by SHA) still exist at a given time
    last_file_hash = {}  # File SHAs when they were last seen

    # Allow script to be paused and process count to change
    def handler(a, b):
        try:
            blamer.pause()
            print("\n\nProcess paused")
            x = int(
                input(
                    "0. Exit\n1. Continue\n2. Modify process count\nSelect an option: "
                )
            )

            if x == 1:
                return blamer.resume()
            elif x == 2:
                x = int(
                    input(
                        "\n\nCurrent Processes: {:d}\nNew Setting: ".format(
                            blamer.proc_count
                        )
                    )
                )
                if x > 0:
                    blamer.proc_count = x
                    blamer.spawn_process(spawn_only=True)
                return blamer.resume()
            os._exit(1)  # sys.exit() does weird things
        except:
            # Invalid input (e.g. non-integer): re-prompt the menu.
            pass
            handler(None, None)

    if not quiet:
        safe_signal(signal.SIGINT, handler)

    desc = "{:<55s}".format(
        "Analyzing commit history with {:d} processes".format(procs)
    )
    with tqdm(
        desc="{:<55s}".format("Entries Processed"),
        total=entries_total,
        unit=" Entries",
        position=1,
        maxinterval=1,
        miniters=100,
        **tqdm_args,
    ) as bar:
        cbar = tqdm(master_commits, desc=desc, unit=" Commits", position=0, **tqdm_args)
        for commit in cbar:
            t = datetime.datetime.utcfromtimestamp(commit.committed_date)
            ts.append(t)  # x axis

            # START: Fast diff, to reduce no. of files checked via blame.
            # File hashes are checked against previous iteration
            entries = all_entries.pop(
                0
            )  # all_entries grows smaller as curves grows larger

            check_entries = []
            cur_file_hash = {}
            for entry in entries:
                cur_file_hash[entry.path] = entry.binsha
                if entry.path in last_file_hash:
                    if last_file_hash[entry.path] != entry.binsha:  # Modified file
                        # Retract the file's previous contribution before re-blaming.
                        for key_tuple, count in last_file_y[entry.path].items():
                            cur_y[key_tuple] -= count
                        check_entries.append(entry)
                    else:  # Identical file
                        bar.update()
                    del last_file_hash[
                        entry.path
                    ]  # Identical/Modified file removed, leaving deleted files behind
                else:  # Newly added file
                    check_entries.append(entry)
            for deleted_path in last_file_hash.keys():  # Deleted files
                for key_tuple, count in last_file_y[deleted_path].items():
                    cur_y[key_tuple] -= count
            last_file_hash = cur_file_hash
            # END: Fast diff

            # Multiprocess blame checker, updates cur_y & last_file_y
            blamer.fetch(commit, check_entries, bar)
            cbar.set_description(
                "{:<55s}".format(
                    "Analyzing commit history with {:d} processes".format(
                        len(blamer.proc_pool)
                    )
                ),
                False,
            )

            # Record survival of lines per originating commit SHA.
            for key_tuple, count in cur_y.items():
                key_category, key = key_tuple
                if key_category == "sha":
                    commit_history.setdefault(key, []).append(
                        (commit.committed_date, count)
                    )

            # Append this time step's value to every tracked curve.
            for key_tuple in curve_key_tuples:
                curves.setdefault(key_tuple, []).append(cur_y.get(key_tuple, 0))

    safe_signal(signal.SIGINT, signal.default_int_handler)

    results = {}

    def get_data(key_type, label_fmt=lambda x: x):
        # Assemble one plot dataset: sorted labels, shared timestamps, y-series.
        key_items = sorted(k for t, k in curve_key_tuples if t == key_type)
        return {
            "y": [curves[(key_type, key_item)] for key_item in key_items],
            "ts": [t.isoformat() for t in ts],
            "labels": [label_fmt(key_item) for key_item in key_items],
        }

    results["cohorts"] = get_data("cohort", lambda c: "Code added in %s" % c)
    results["exts"] = get_data("ext")
    results["authors"] = get_data("author")
    results["dirs"] = get_data("dir")
    results["domains"] = get_data("domain")
    results["survival"] = commit_history

    if outdir:
        for key in ["cohorts", "exts", "authors", "dirs", "domains"]:
            fn = os.path.join(outdir, f"{key}.json")
            if not quiet:
                print("Writing data to %s" % fn)
            with open(fn, "w") as f:
                json.dump(results[key], f)

        # Survival data
        fn = os.path.join(outdir, "survival.json")
        if not quiet:
            print("Writing survival data to %s" % fn)
        with open(fn, "w") as f:
            json.dump(commit_history, f)

    return results
556
+
557
+
558
@functools.lru_cache(maxsize=None)
def get_mailmap_author_name_email(repo, author_name, author_email):
    """Resolve an author through git's mailmap and return (name, email).

    Results are memoized per (repo, author_name, author_email), since each
    lookup goes through `repo.git.check_mailmap`.
    """
    # git check-mailmap takes and returns the "Name <email>" form.
    canonical = repo.git.check_mailmap(f"{author_name} <{author_email}>")
    # Drop the trailing ">" and split on the first " <" separator.
    name, email = canonical[:-1].split(" <", maxsplit=1)
    return name, email
564
+
565
+
566
def analyze_cmdline():
    """Command-line entry point: parse arguments and invoke `analyze`."""
    parser = argparse.ArgumentParser(description="Analyze git repo")
    parser.add_argument(
        "--cohortfm",
        default="%Y",
        type=str,
        help='A Python datetime format string such as "%%Y" for creating cohorts (default: %(default)s)',
    )
    parser.add_argument(
        "--interval",
        default=7 * 24 * 60 * 60,
        type=int,
        help="Min difference between commits to analyze (default: %(default)ss)",
    )
    parser.add_argument(
        "--ignore",
        default=[],
        action="append",
        help="File patterns that should be ignored (can provide multiple, will each subtract independently). Uses glob syntax and generally needs to be shell escaped. For instance, to ignore a subdirectory `foo/`, run `git-of-theseus . --ignore 'foo/**'`.",
    )
    parser.add_argument(
        "--only",
        default=[],
        action="append",
        help="File patterns that can match. Multiple can be provided. If at least one is provided, every file has to match at least one. Uses glob syntax and typically has to be shell escaped. In order to analyze a subdirectory `bar/`, run `git-of-theseus . --only 'bar/**'`",
    )
    parser.add_argument(
        "--outdir",
        default=".",
        help="Output directory to store results (default: %(default)s)",
    )
    parser.add_argument(
        "--branch",
        default="master",
        type=str,
        help="Branch to track (default: %(default)s)",
    )
    # Fix: this flag previously declared `default=[]` alongside
    # `action="store_true"`; it only worked because [] is falsy.  store_true's
    # natural default of False matches analyze()'s signature.
    parser.add_argument(
        "--ignore-whitespace",
        action="store_true",
        help="Ignore whitespace changes when running git blame.",
    )
    parser.add_argument(
        "--all-filetypes",
        action="store_true",
        help="Include all files (if not set then will only analyze %s)"
        % ",".join(default_filetypes),
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Disable all console output (default: %(default)s)",
    )
    parser.add_argument(
        "--procs",
        default=2,
        type=int,
        help="Number of processes to use. There is a point of diminishing returns, and RAM may become an issue on large repos (default: %(default)s)",
    )
    parser.add_argument(
        "--opt",
        action="store_true",
        help="Generates git commit-graph; Improves performance at the cost of some (~80KB/kCommit) disk space (default: %(default)s)",
    )
    parser.add_argument("repo_dir")
    kwargs = vars(parser.parse_args())

    try:
        analyze(**kwargs)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long analysis; exit quietly.
        exit(1)
640
+
641
+
642
# Allow invoking this module directly as a script.
if __name__ == "__main__":
    analyze_cmdline()