better-git-of-theseus 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- better_git_of_theseus-0.4.0.dist-info/METADATA +122 -0
- better_git_of_theseus-0.4.0.dist-info/RECORD +15 -0
- better_git_of_theseus-0.4.0.dist-info/WHEEL +5 -0
- better_git_of_theseus-0.4.0.dist-info/entry_points.txt +6 -0
- better_git_of_theseus-0.4.0.dist-info/licenses/LICENSE +201 -0
- better_git_of_theseus-0.4.0.dist-info/top_level.txt +1 -0
- git_of_theseus/__init__.py +4 -0
- git_of_theseus/analyze.py +643 -0
- git_of_theseus/app.py +133 -0
- git_of_theseus/cmd.py +23 -0
- git_of_theseus/line_plot.py +102 -0
- git_of_theseus/plotly_plots.py +243 -0
- git_of_theseus/stack_plot.py +98 -0
- git_of_theseus/survival_plot.py +162 -0
- git_of_theseus/utils.py +33 -0
git_of_theseus/analyze.py

@@ -0,0 +1,643 @@
# -*- coding: utf-8 -*-
#
# Copyright 2016 Erik Bernhardsson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import datetime
import functools
import json
import multiprocessing
import os
import signal
import threading
import warnings
from pathlib import Path

import git
import pygments.lexers
from tqdm import tqdm
from wcmatch import fnmatch

def safe_signal(signum, handler):
    try:
        signal.signal(signum, handler)
    except ValueError:
        # Cannot set a signal handler from a non-main thread
        pass


# Some filetypes in Pygments are not necessarily computer code, but
# configuration/documentation. Let's not include those.
IGNORE_PYGMENTS_FILETYPES = [
    "*.json",
    "*.md",
    "*.ps",
    "*.eps",
    "*.txt",
    "*.xml",
    "*.xsl",
    "*.rss",
    "*.xslt",
    "*.xsd",
    "*.wsdl",
    "*.wsf",
    "*.yaml",
    "*.yml",
]

default_filetypes = set()
for _, _, filetypes, _ in pygments.lexers.get_all_lexers():
    default_filetypes.update(filetypes)
default_filetypes.difference_update(IGNORE_PYGMENTS_FILETYPES)


class MiniEntry:
    def __init__(self, entry):
        self.path = entry.path
        self.binsha = entry.binsha


class MiniCommit:
    def __init__(self, commit):
        self.hexsha = commit.hexsha
        self.committed_date = commit.committed_date


def get_top_dir(path):
    # Git/GitPython on Windows also returns paths with '/'s
    return os.path.dirname(path).split("/")[0] + "/"
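# Illustration of get_top_dir (an editorial sketch): paths are '/'-separated
# as Git reports them, and files at the repository root map to "/":
#
#     get_top_dir("git_of_theseus/analyze.py")  # -> "git_of_theseus/"
#     get_top_dir("README.md")                  # -> "/"
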
class BlameProc(multiprocessing.Process):
    def __init__(
        self, repo_dir, q, ret_q, run_flag, blame_kwargs, commit2cohort, use_mailmap
    ):
        super().__init__(daemon=True)
        self.repo: git.Repo = git.Repo(repo_dir)
        self.q: multiprocessing.Queue = q
        self.ret_q: multiprocessing.Queue = ret_q
        self.run_flag: multiprocessing.Event = run_flag
        self.blame_kwargs = dict(blame_kwargs)
        # On Unix systems, if the process is started via the `fork` method, this
        # could be made a copy-on-write variable to save RAM
        self.commit2cohort = commit2cohort
        self.use_mailmap = use_mailmap

    # Get blame data for the file at `path` as of `commit`
    def get_file_histogram(self, path, commit):
        h = {}
        try:
            for old_commit, lines in self.repo.blame(commit, path, **self.blame_kwargs):
                cohort = self.commit2cohort.get(old_commit.binsha, "MISSING")
                _, ext = os.path.splitext(path)
                if self.use_mailmap:
                    author_name, author_email = get_mailmap_author_name_email(
                        self.repo, old_commit.author.name, old_commit.author.email
                    )
                else:
                    author_name, author_email = (
                        old_commit.author.name,
                        old_commit.author.email,
                    )
                keys = [
                    ("cohort", cohort),
                    ("ext", ext),
                    ("author", author_name),
                    ("dir", get_top_dir(path)),
                    ("domain", author_email.split("@")[-1]),
                ]

                if old_commit.binsha in self.commit2cohort:
                    keys.append(("sha", old_commit.hexsha))

                for key in keys:
                    h[key] = h.get(key, 0) + len(lines)
        except:
            # Blame can fail (e.g. on files that vanished mid-walk); count
            # nothing for this file rather than aborting the worker
            pass
        return h
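    # Shape of the returned histogram (illustrative values): keys are
    # (category, value) tuples, values are line counts, e.g.
    #   {("cohort", "2016"): 120, ("ext", ".py"): 312, ("author", "Erik"): 312,
    #    ("dir", "git_of_theseus/"): 312, ("domain", "gmail.com"): 312,
    #    ("sha", "1f2e3d..."): 48}
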
    def run(self):
        safe_signal(signal.SIGINT, signal.SIG_IGN)
        try:
            while self.run_flag.wait():
                entry, commit = self.q.get()
                if not commit:
                    return
                self.ret_q.put((entry, self.get_file_histogram(entry, commit)))
        except:
            raise


class BlameDriver:
    def __init__(
        self,
        repo_dir,
        proc_count,
        last_file_y,
        cur_y,
        blame_kwargs,
        commit2cohort,
        use_mailmap,
        quiet,
    ):
        self.repo_dir = repo_dir
        self.proc_count = proc_count
        self.q = multiprocessing.Queue()
        self.ret_q = multiprocessing.Queue()
        self.run_flag = multiprocessing.Event()
        self.run_flag.set()
        self.last_file_y = last_file_y
        self.cur_y = cur_y
        self.blame_kwargs = blame_kwargs
        self.commit2cohort = commit2cohort
        self.use_mailmap = use_mailmap
        self.quiet = quiet
        self.proc_pool = []
        self.spawn_process()

    def spawn_process(self, spawn_only=False):
        n = self.proc_count - len(self.proc_pool)
        if n == 0:
            return
        if n < 0:
            return None if spawn_only else self._despawn_process(-n)
        if not self.quiet:
            print("\n\nStarting up processes: ", end="")
        for i in range(n):
            self.proc_pool.append(
                BlameProc(
                    self.repo_dir,
                    self.q,
                    self.ret_q,
                    self.run_flag,
                    self.blame_kwargs,
                    self.commit2cohort,
                    self.use_mailmap,
                )
            )
            self.proc_pool[-1].start()
            if not self.quiet:
                print(
                    ("" if i == 0 else ", ") + self.proc_pool[-1].name,
                    end="\n" if i == n - 1 else "",
                )

    def _despawn_process(self, n):
        # A (None, None) sentinel on the queue makes a worker return
        for i in range(n):
            self.q.put((None, None))

        print("\n")
        while True:
            print("\rShutting down processes: ", end="")
            killed_processes = 0
            for idx, proc in enumerate(self.proc_pool):
                if proc.is_alive():
                    continue
                else:
                    print(
                        ("" if killed_processes == 0 else ", ") + proc.name,
                        end="\n" if killed_processes == n - 1 else "",
                    )
                    killed_processes += 1
                if killed_processes >= n:
                    for proc in self.proc_pool:
                        if not proc.is_alive():
                            proc.join()
                    self.proc_pool = [proc for proc in self.proc_pool if proc.is_alive()]
                    return

    def fetch(self, commit, check_entries, bar):
        self.spawn_process()
        processed_entries = 0
        total_entries = len(check_entries)

        for entry in check_entries:
            self.q.put((entry.path, commit.hexsha))

        while processed_entries < total_entries:
            path, file_y = self.ret_q.get()

            for key_tuple, file_locs in file_y.items():
                self.cur_y[key_tuple] = self.cur_y.get(key_tuple, 0) + file_locs
            self.last_file_y[path] = file_y

            processed_entries += 1
            self.run_flag.wait()
            bar.update()

        return self.cur_y

    def pause(self):
        self.run_flag.clear()

    def resume(self):
        self.run_flag.set()
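    # Pause/resume in a nutshell: every worker calls run_flag.wait() between
    # tasks, so clear() parks the pool after the current task and set()
    # releases it. A minimal standalone sketch of the same pattern
    # (hypothetical names):
    #
    #     import multiprocessing, time
    #
    #     def worker(flag):
    #         while True:
    #             flag.wait()      # parks here while the Event is cleared
    #             time.sleep(0.1)  # one unit of work
    #
    #     flag = multiprocessing.Event(); flag.set()
    #     multiprocessing.Process(target=worker, args=(flag,), daemon=True).start()
    #     flag.clear()  # pause
    #     flag.set()    # resume
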
def analyze(
    repo_dir,
    cohortfm="%Y",
    interval=7 * 24 * 60 * 60,
    ignore=[],
    only=[],
    outdir=".",
    branch="master",
    all_filetypes=False,
    ignore_whitespace=False,
    procs=2,
    quiet=False,
    opt=False,
):
    use_mailmap = (Path(repo_dir) / ".mailmap").exists()
    repo = git.Repo(repo_dir)
    blame_kwargs = {}
    if ignore_whitespace:
        blame_kwargs["w"] = True
    master_commits = []  # only stores a subset
    commit2cohort = {}
    curve_key_tuples = set()  # Keys of each curve that will be tracked
    tqdm_args = {
        "smoothing": 0.025,  # Exponential smoothing is still rather jumpy, a tiny number will do
        "disable": quiet,
        "dynamic_ncols": True,
    }

    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # Check if the specified branch exists
    try:
        repo.git.show_ref("refs/heads/{:s}".format(branch), verify=True)
    except git.exc.GitCommandError:
        default_branch = repo.active_branch.name
        warnings.warn(
            "Requested branch: '{:s}' does not exist. Falling back to default branch: '{:s}'".format(
                branch, default_branch
            )
        )

        branch = default_branch

    if not quiet and repo.git.version_info < (2, 31, 0):
        print(
            "Old Git version {:d}.{:d}.{:d} detected. There are optimizations available in version 2.31.0 which speed up performance".format(
                *repo.git.version_info
            )
        )

    if opt:
        if not quiet:
            print(
                "Generating git commit-graph... If you wish, this file is deletable later at .git/objects/info"
            )
        # repo.git.commit_graph('write --changed-paths') doesn't work for some reason
        repo.git.execute(["git", "commit-graph", "write", "--changed-paths"])

    desc = "{:<55s}".format("Listing all commits")
    for commit in tqdm(
        repo.iter_commits(branch), desc=desc, unit=" Commits", **tqdm_args
    ):
        cohort = datetime.datetime.utcfromtimestamp(commit.committed_date).strftime(
            cohortfm
        )
        commit2cohort[commit.binsha] = cohort
        curve_key_tuples.add(("cohort", cohort))
        if use_mailmap:
            author_name, author_email = get_mailmap_author_name_email(
                repo, commit.author.name, commit.author.email
            )
        else:
            author_name, author_email = commit.author.name, commit.author.email
        curve_key_tuples.add(("author", author_name))
        curve_key_tuples.add(("domain", author_email.split("@")[-1]))

    desc = "{:<55s}".format("Backtracking the master branch")
    with tqdm(desc=desc, unit=" Commits", **tqdm_args) as bar:
        commit = repo.head.commit
        last_date = None
        while True:
            if last_date is None or commit.committed_date < last_date - interval:
                master_commits.append(commit)
                last_date = commit.committed_date
            bar.update()
            if not commit.parents:
                break
            commit = commit.parents[0]
        del commit

    if ignore and not only:
        only = ["**"]  # stupid fix
    def_ft_str = "+({:s})".format("|".join(default_filetypes))
    path_match_str = "{:s}|!+({:s})".format("|".join(only), "|".join(ignore))
    path_match_zero = len(only) == 0 and len(ignore) == 0
    ok_entry_paths = dict()
    all_entries = []

    def entry_path_ok(path):
        # All this matching is slow so let's cache it
        if path not in ok_entry_paths:
            ok_entry_paths[path] = (
                all_filetypes
                or fnmatch.fnmatch(
                    os.path.split(path)[-1], def_ft_str, flags=fnmatch.EXTMATCH
                )
            ) and (
                path_match_zero
                or fnmatch.fnmatch(
                    path,
                    path_match_str,
                    flags=fnmatch.NEGATE | fnmatch.EXTMATCH | fnmatch.SPLIT,
                )
            )
        return ok_entry_paths[path]
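    # Pattern semantics above (wcmatch flags): EXTMATCH enables "+(a|b)"
    # groups, SPLIT lets one string carry several "|"-separated patterns, and
    # NEGATE treats "!pat" as an exclusion. So with only=["bar/**"] and
    # ignore=["foo/**"], path_match_str is "bar/**|!+(foo/**)": a path passes
    # if it matches "bar/**" and does not match "foo/**". For example:
    #
    #     fnmatch.fnmatch("main.py", "+(*.py|*.c)", flags=fnmatch.EXTMATCH)  # True
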
    def get_entries(commit):
        tmp = [
            MiniEntry(entry)
            for entry in commit.tree.traverse()
            if entry.type == "blob" and entry_path_ok(entry.path)
        ]
        all_entries.append(tmp)
        return tmp

    master_commits = master_commits[::-1]  # Reverse so it's chronologically ascending
    entries_total = 0
    desc = "{:<55s}".format("Discovering entries & caching filenames")
    with tqdm(
        desc="{:<55s}".format("Entries Discovered"),
        unit=" Entries",
        position=1,
        **tqdm_args,
    ) as bar:
        for i, commit in enumerate(
            tqdm(master_commits, desc=desc, unit=" Commits", position=0, **tqdm_args)
        ):
            for entry in get_entries(commit):
                entries_total += 1
                _, ext = os.path.splitext(entry.path)
                curve_key_tuples.add(("ext", ext))
                curve_key_tuples.add(("dir", get_top_dir(entry.path)))
                bar.update()
            # The full commit might have cached the entries; we don't want that
            master_commits[i] = MiniCommit(commit)

    # We don't need these anymore; let the GC clean up
    del repo
    del ok_entry_paths
    del commit
    # End GC cleanup

    curves = {}  # Multiple y axes, keyed as key_tuple -> list of y-axis points
    ts = []  # x axis
    # Contribution of each individual file to each curve, as of when the file
    # was last seen
    last_file_y = {}
    cur_y = {}  # Sum of all file contributions towards each individual curve
    blamer = BlameDriver(
        repo_dir,
        procs,
        last_file_y,
        cur_y,
        blame_kwargs,
        commit2cohort,
        use_mailmap,
        quiet,
    )
    # How many lines of a commit (by SHA) still exist at a given time
    commit_history = {}
    last_file_hash = {}  # File SHAs as of when each file was last seen

    # Allow the script to be paused and the process count to be changed
    def handler(a, b):
        try:
            blamer.pause()
            print("\n\nProcess paused")
            x = int(
                input(
                    "0. Exit\n1. Continue\n2. Modify process count\nSelect an option: "
                )
            )

            if x == 1:
                return blamer.resume()
            elif x == 2:
                x = int(
                    input(
                        "\n\nCurrent Processes: {:d}\nNew Setting: ".format(
                            blamer.proc_count
                        )
                    )
                )
                if x > 0:
                    blamer.proc_count = x
                    blamer.spawn_process(spawn_only=True)
                return blamer.resume()
            os._exit(1)  # sys.exit() does weird things
        except:
            pass
        handler(None, None)  # Re-prompt on invalid input

    if not quiet:
        safe_signal(signal.SIGINT, handler)

    desc = "{:<55s}".format(
        "Analyzing commit history with {:d} processes".format(procs)
    )
    with tqdm(
        desc="{:<55s}".format("Entries Processed"),
        total=entries_total,
        unit=" Entries",
        position=1,
        maxinterval=1,
        miniters=100,
        **tqdm_args,
    ) as bar:
        cbar = tqdm(master_commits, desc=desc, unit=" Commits", position=0, **tqdm_args)
        for commit in cbar:
            t = datetime.datetime.utcfromtimestamp(commit.committed_date)
            ts.append(t)  # x axis

            # START: Fast diff, to reduce the number of files checked via blame.
            # File hashes are compared against the previous iteration.
            # all_entries grows smaller as curves grows larger
            entries = all_entries.pop(0)

            check_entries = []
            cur_file_hash = {}
            for entry in entries:
                cur_file_hash[entry.path] = entry.binsha
                if entry.path in last_file_hash:
                    if last_file_hash[entry.path] != entry.binsha:  # Modified file
                        for key_tuple, count in last_file_y[entry.path].items():
                            cur_y[key_tuple] -= count
                        check_entries.append(entry)
                    else:  # Identical file
                        bar.update()
                    # Identical/modified entries are removed, leaving deleted files behind
                    del last_file_hash[entry.path]
                else:  # Newly added file
                    check_entries.append(entry)
            for deleted_path in last_file_hash.keys():  # Deleted files
                for key_tuple, count in last_file_y[deleted_path].items():
                    cur_y[key_tuple] -= count
            last_file_hash = cur_file_hash
            # END: Fast diff
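            # The fast diff at a glance (illustrative): with
            #   last_file_hash = {"a.py": sha1, "b.py": sha2}
            # and a current snapshot {"a.py": sha1, "b.py": sha3, "c.py": sha4},
            # "a.py" is identical (no re-blame), "b.py" is modified and "c.py"
            # is new (both get re-blamed), and whatever remains in
            # last_file_hash was deleted, so its last-seen contributions are
            # subtracted.
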
            # Multiprocess blame checker; updates cur_y & last_file_y
            blamer.fetch(commit, check_entries, bar)
            cbar.set_description(
                "{:<55s}".format(
                    "Analyzing commit history with {:d} processes".format(
                        len(blamer.proc_pool)
                    )
                ),
                False,  # refresh=False; tqdm repaints on its own schedule
            )

            for key_tuple, count in cur_y.items():
                key_category, key = key_tuple
                if key_category == "sha":
                    commit_history.setdefault(key, []).append(
                        (commit.committed_date, count)
                    )

            for key_tuple in curve_key_tuples:
                curves.setdefault(key_tuple, []).append(cur_y.get(key_tuple, 0))

    safe_signal(signal.SIGINT, signal.default_int_handler)

    results = {}

    def get_data(key_type, label_fmt=lambda x: x):
        key_items = sorted(k for t, k in curve_key_tuples if t == key_type)
        return {
            "y": [curves[(key_type, key_item)] for key_item in key_items],
            "ts": [t.isoformat() for t in ts],
            "labels": [label_fmt(key_item) for key_item in key_items],
        }

    results["cohorts"] = get_data("cohort", lambda c: "Code added in %s" % c)
    results["exts"] = get_data("ext")
    results["authors"] = get_data("author")
    results["dirs"] = get_data("dir")
    results["domains"] = get_data("domain")
    results["survival"] = commit_history

    if outdir:
        for key in ["cohorts", "exts", "authors", "dirs", "domains"]:
            fn = os.path.join(outdir, f"{key}.json")
            if not quiet:
                print("Writing data to %s" % fn)
            with open(fn, "w") as f:
                json.dump(results[key], f)

        # Survival data
        fn = os.path.join(outdir, "survival.json")
        if not quiet:
            print("Writing survival data to %s" % fn)
        with open(fn, "w") as f:
            json.dump(commit_history, f)

    return results


@functools.lru_cache(maxsize=None)
def get_mailmap_author_name_email(repo, author_name, author_email):
    pre_mailmap_author_email = f"{author_name} <{author_email}>"
    mail_mapped_author_email: str = repo.git.check_mailmap(pre_mailmap_author_email)
    mailmap_name, mailmap_email = mail_mapped_author_email[:-1].split(" <", maxsplit=1)
    return mailmap_name, mailmap_email
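# Parsing note for get_mailmap_author_name_email: `git check-mailmap` echoes
# the canonical identity in "Name <email>" form, so dropping the trailing ">"
# and splitting on the first " <" recovers the pair:
#
#     s = "Real Name <real@example.com>"
#     s[:-1].split(" <", maxsplit=1)  # -> ["Real Name", "real@example.com"]
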
def analyze_cmdline():
    parser = argparse.ArgumentParser(description="Analyze git repo")
    parser.add_argument(
        "--cohortfm",
        default="%Y",
        type=str,
        help='A Python datetime format string such as "%%Y" for creating cohorts (default: %(default)s)',
    )
    parser.add_argument(
        "--interval",
        default=7 * 24 * 60 * 60,
        type=int,
        help="Min difference between commits to analyze (default: %(default)ss)",
    )
    parser.add_argument(
        "--ignore",
        default=[],
        action="append",
        help="File patterns that should be ignored (can provide multiple, will each subtract independently). Uses glob syntax and generally needs to be shell escaped. For instance, to ignore a subdirectory `foo/`, run `git-of-theseus . --ignore 'foo/**'`.",
    )
    parser.add_argument(
        "--only",
        default=[],
        action="append",
        help="File patterns that can match. Multiple can be provided. If at least one is provided, every file has to match at least one. Uses glob syntax and typically has to be shell escaped. In order to analyze a subdirectory `bar/`, run `git-of-theseus . --only 'bar/**'`",
    )
    parser.add_argument(
        "--outdir",
        default=".",
        help="Output directory to store results (default: %(default)s)",
    )
    parser.add_argument(
        "--branch",
        default="master",
        type=str,
        help="Branch to track (default: %(default)s)",
    )
    parser.add_argument(
        "--ignore-whitespace",
        default=False,
        action="store_true",
        help="Ignore whitespace changes when running git blame.",
    )
    parser.add_argument(
        "--all-filetypes",
        action="store_true",
        help="Include all files (if not set, only the following are analyzed: %s)"
        % ",".join(default_filetypes),
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Disable all console output (default: %(default)s)",
    )
    parser.add_argument(
        "--procs",
        default=2,
        type=int,
        help="Number of processes to use. There is a point of diminishing returns, and RAM may become an issue on large repos (default: %(default)s)",
    )
    parser.add_argument(
        "--opt",
        action="store_true",
        help="Generates git commit-graph; improves performance at the cost of some (~80KB/kCommit) disk space (default: %(default)s)",
    )
    parser.add_argument("repo_dir")
    kwargs = vars(parser.parse_args())

    try:
        analyze(**kwargs)
    except KeyboardInterrupt:
        exit(1)
    except:
        raise


if __name__ == "__main__":
    analyze_cmdline()
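For orientation, a minimal programmatic sketch of driving this module (hedged: it assumes the import path implied by the RECORD listing above, and the repository path is a placeholder):

    from git_of_theseus.analyze import analyze

    # Writes cohorts.json, exts.json, authors.json, dirs.json, domains.json and
    # survival.json into ./theseus-out, and returns the same data as a dict
    results = analyze("/path/to/repo", branch="master", procs=4, outdir="theseus-out")
    print(results["cohorts"]["labels"])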