mongodb-livedata-server 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4005 +0,0 @@
- #!/usr/bin/env python3
-
- """
- git-filter-repo filters git repositories, similar to git filter-branch, BFG
- repo cleaner, and others. The basic idea is that it works by running
-   git fast-export <options> | filter | git fast-import <options>
- where this program not only launches the whole pipeline but also serves as
- the 'filter' in the middle. It does a few additional things on top as well
- in order to make it into a well-rounded filtering tool.
-
- git-filter-repo can also be used as a library for more involved filtering
- operations; however:
-   ***** API BACKWARD COMPATIBILITY CAVEAT *****
-   Programs using git-filter-repo as a library can reach pretty far into its
-   internals, but I am not prepared to guarantee backward compatibility of
-   all APIs. I suspect changes will be rare, but I reserve the right to
-   change any API. Since it is assumed that repository filtering is
-   something one would do very rarely, and in particular that it's a
-   one-shot operation, this should not be a problem in practice for anyone.
-   However, if you want to re-use a program you have written that uses
-   git-filter-repo as a library (or makes use of one of its --*-callback
-   arguments), you should either make sure you are using the same version of
-   git and git-filter-repo, or make sure to re-test it.
-
-   If there are particular pieces of the API you are concerned about, and
-   there is not already a testcase for it in t9391-lib-usage.sh or
-   t9392-python-callback.sh, please contribute a testcase. That will not
-   prevent me from changing the API, but it will allow you to look at the
-   history of a testcase to see whether and how the API changed.
-   ***** END API BACKWARD COMPATIBILITY CAVEAT *****
- """
-
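For orientation, the pipeline the docstring describes can be sketched in a few lines of plain Python. This sketch is not part of the package; the repository paths and the pass-through loop are purely illustrative (a real filter parses and rewrites elements rather than copying bytes through):

  import subprocess

  # git fast-export <options> | filter | git fast-import <options>
  exporter = subprocess.Popen(['git', 'fast-export', '--all'],
                              stdout=subprocess.PIPE, cwd='/path/to/repo')
  importer = subprocess.Popen(['git', 'fast-import', '--quiet'],
                              stdin=subprocess.PIPE, cwd='/path/to/newrepo')
  for line in exporter.stdout:
    importer.stdin.write(line)  # the do-nothing "filter" in the middle
  importer.stdin.close()
  exporter.wait()
  importer.wait()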
- import argparse
- import collections
- import fnmatch
- import gettext
- import io
- import os
- import platform
- import re
- import shutil
- import subprocess
- import sys
- import time
- import textwrap
-
- from datetime import tzinfo, timedelta, datetime
-
- __all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress",
-            "Checkpoint", "FastExportParser", "ProgressWriter",
-            "string_to_date", "date_to_string",
-            "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]
-
- deleted_hash = b'0'*40
- write_marks = True
- date_format_permissive = True
-
- def gettext_poison(msg):
-   if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover
-     return "# GETTEXT POISON #"
-   return gettext.gettext(msg)
-
- _ = gettext_poison
-
- def setup_gettext():
-   TEXTDOMAIN="git-filter-repo"
-   podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@"
-   if not os.path.isdir(podir): # pragma: no cover
-     podir = None # Python has its own fallback; use that
-
-   ## This looks like the most straightforward translation of the relevant
-   ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm:
-   #import locale
-   #locale.setlocale(locale.LC_MESSAGES, "");
-   #locale.setlocale(locale.LC_TIME, "");
-   #locale.textdomain(TEXTDOMAIN);
-   #locale.bindtextdomain(TEXTDOMAIN, podir);
-   ## but the python docs suggest using the gettext module (which doesn't
-   ## have setlocale()) instead, so:
-   gettext.textdomain(TEXTDOMAIN);
-   gettext.bindtextdomain(TEXTDOMAIN, podir);
-
- def _timedelta_to_seconds(delta):
-   """
-   Converts timedelta to seconds
-   """
-   offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
-   return round(offset)
-
- class FixedTimeZone(tzinfo):
-   """
-   Fixed offset in minutes east from UTC.
-   """
-
-   tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$')
-
-   def __init__(self, offset_string):
-     tzinfo.__init__(self)
-     sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups()
-     factor = -1 if (sign and sign == b'-') else 1
-     self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm)))
-     self._offset_string = offset_string
-
-   def utcoffset(self, dt):
-     return self._offset
-
-   def tzname(self, dt):
-     return self._offset_string
-
-   def dst(self, dt):
-     return timedelta(0)
-
- def string_to_date(datestring):
-   (unix_timestamp, tz_offset) = datestring.split()
-   return datetime.fromtimestamp(int(unix_timestamp),
-                                 FixedTimeZone(tz_offset))
-
- def date_to_string(dateobj):
-   epoch = datetime.fromtimestamp(0, dateobj.tzinfo)
-   return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)),
-                      dateobj.tzinfo.tzname(0)))
-
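For context, string_to_date() and date_to_string() convert between the `<unix-timestamp> <tz-offset>` byte format used by fast-export and timezone-aware datetime objects. A minimal round-trip sketch (not part of the package; the timestamp is illustrative):

  when = string_to_date(b'1234567890 +0100')   # datetime with a FixedTimeZone
  assert date_to_string(when) == b'1234567890 +0100'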
- def decode(bytestr):
-   'Try to convert bytestr to utf-8 for outputting as an error message.'
-   return bytestr.decode('utf-8', 'backslashreplace')
-
- def glob_to_regex(glob_bytestr):
-   'Translate glob_bytestr into a regex on bytestrings'
-
-   # fnmatch.translate is idiotic and won't accept bytestrings
-   if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover
-     raise SystemExit(_("Error: Cannot handle glob %s") % decode(glob_bytestr))
-
-   # Create regex operating on string
-   regex = fnmatch.translate(decode(glob_bytestr))
-
-   # FIXME: This is an ugly hack...
-   # fnmatch.translate tries to do multi-line matching and wants the glob to
-   # match up to the end of the input, which isn't relevant for us, so we
-   # have to modify the regex. fnmatch.translate has used different regex
-   # constructs to achieve this with different python versions, so we have
-   # to check for each of them and then fix it up. It would be much better
-   # if fnmatch.translate could just take some flags to allow us to specify
-   # what we want rather than employing this hackery, but since it
-   # doesn't...
-   if regex.endswith(r'\Z(?ms)'): # pragma: no cover
-     regex = regex[0:-7]
-   elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover
-     regex = regex[4:-3]
-
-   # Finally, convert back to regex operating on bytestr
-   return regex.encode()
-
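A short sketch of how glob_to_regex() is meant to be used; the pattern and paths are illustrative. Note that, as with fnmatch, `*` is not stopped by `/`:

  matcher = re.compile(glob_to_regex(b'src/*.c'))
  assert matcher.match(b'src/main.c')
  assert not matcher.match(b'README.md')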
- class PathQuoting:
-   _unescape = {b'a': b'\a',
-                b'b': b'\b',
-                b'f': b'\f',
-                b'n': b'\n',
-                b'r': b'\r',
-                b't': b'\t',
-                b'v': b'\v',
-                b'"': b'"',
-                b'\\':b'\\'}
-   _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})')
-   _escape = [bytes([x]) for x in range(127)]+[
-              b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)]
-   _reverse = dict(map(reversed, _unescape.items()))
-   for x in _reverse:
-     _escape[ord(x)] = b'\\'+_reverse[x]
-   _special_chars = [len(x) > 1 for x in _escape]
-
-   @staticmethod
-   def unescape_sequence(orig):
-     seq = orig.group(1)
-     return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)])
-
-   @staticmethod
-   def dequote(quoted_string):
-     if quoted_string.startswith(b'"'):
-       assert quoted_string.endswith(b'"')
-       return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence,
-                                           quoted_string[1:-1])
-     return quoted_string
-
-   @staticmethod
-   def enquote(unquoted_string):
-     # Option 1: Quoting when fast-export would:
-     #    pqsc = PathQuoting._special_chars
-     #    if any(pqsc[x] for x in set(unquoted_string)):
-     # Option 2, perf hack: do minimal amount of quoting required by fast-import
-     if unquoted_string.startswith(b'"') or b'\n' in unquoted_string:
-       pqe = PathQuoting._escape
-       return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"'
-     return unquoted_string
-
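A round-trip sketch for PathQuoting (the path is illustrative); with the "perf hack" above, quoting is only applied to paths that begin with a double quote or contain a newline:

  assert PathQuoting.enquote(b'a\nb') == b'"a\\nb"'
  assert PathQuoting.dequote(b'"a\\nb"') == b'a\nb'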
- class AncestryGraph(object):
-   """
-   A class that maintains a directed acyclic graph of commits for the purpose
-   of determining if one commit is the ancestor of another.
-   """
-
-   def __init__(self):
-     self.cur_value = 0
-
-     # A mapping from the external identifiers given to us to the simple
-     # integers we use in self.graph
-     self.value = {}
-
-     # A tuple of (depth, list-of-ancestors). Values and keys in this graph are
-     # all integers from the self.value dict. The depth of a commit is one more
-     # than the max depth of any of its ancestors.
-     self.graph = {}
-
-     # Cached results from previous calls to is_ancestor().
-     self._cached_is_ancestor = {}
-
-   def record_external_commits(self, external_commits):
-     """
-     Record in graph that each commit in external_commits exists, and is
-     treated as a root commit with no parents.
-     """
-     for c in external_commits:
-       if c not in self.value:
-         self.cur_value += 1
-         self.value[c] = self.cur_value
-         self.graph[self.cur_value] = (1, [])
-
-   def add_commit_and_parents(self, commit, parents):
-     """
-     Record in graph that commit has the given parents. parents _MUST_ have
-     been first recorded. commit _MUST_ not have been recorded yet.
-     """
-     assert all(p in self.value for p in parents)
-     assert commit not in self.value
-
-     # Get values for commit and parents
-     self.cur_value += 1
-     self.value[commit] = self.cur_value
-     graph_parents = [self.value[x] for x in parents]
-
-     # Determine depth for commit, then insert the info into the graph
-     depth = 1
-     if parents:
-       depth += max(self.graph[p][0] for p in graph_parents)
-     self.graph[self.cur_value] = (depth, graph_parents)
-
-   def is_ancestor(self, possible_ancestor, check):
-     """
-     Return whether possible_ancestor is an ancestor of check
-     """
-     a, b = self.value[possible_ancestor], self.value[check]
-     original_pair = (a,b)
-     a_depth = self.graph[a][0]
-     ancestors = [b]
-     visited = set()
-     while ancestors:
-       ancestor = ancestors.pop()
-       prev_pair = (a, ancestor)
-       if prev_pair in self._cached_is_ancestor:
-         if not self._cached_is_ancestor[prev_pair]:
-           continue
-         self._cached_is_ancestor[original_pair] = True
-         return True
-       if ancestor in visited:
-         continue
-       visited.add(ancestor)
-       depth, more_ancestors = self.graph[ancestor]
-       if ancestor == a:
-         self._cached_is_ancestor[original_pair] = True
-         return True
-       elif depth <= a_depth:
-         continue
-       ancestors.extend(more_ancestors)
-     self._cached_is_ancestor[original_pair] = False
-     return False
-
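A small sketch of AncestryGraph on a three-commit linear history (the commit names are illustrative; any hashable keys work):

  g = AncestryGraph()
  g.add_commit_and_parents('A', [])
  g.add_commit_and_parents('B', ['A'])
  g.add_commit_and_parents('C', ['B'])
  assert g.is_ancestor('A', 'C')
  assert not g.is_ancestor('C', 'A')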
- class MailmapInfo(object):
-   def __init__(self, filename):
-     self.changes = {}
-     self._parse_file(filename)
-
-   def _parse_file(self, filename):
-     name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*')
-     comment_re = re.compile(br'\s*#.*')
-     if not os.access(filename, os.R_OK):
-       raise SystemExit(_("Cannot read %s") % decode(filename))
-     with open(filename, 'br') as f:
-       count = 0
-       for line in f:
-         count += 1
-         err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line)
-         # Remove comments
-         line = comment_re.sub(b'', line)
-         # Remove leading and trailing whitespace
-         line = line.strip()
-         if not line:
-           continue
-
-         m = name_and_email_re.match(line)
-         if not m:
-           raise SystemExit(err)
-         proper_name, proper_email = m.groups()
-         if len(line) == m.end():
-           self.changes[(None, proper_email)] = (proper_name, proper_email)
-           continue
-         rest = line[m.end():]
-         m = name_and_email_re.match(rest)
-         if m:
-           commit_name, commit_email = m.groups()
-           if len(rest) != m.end():
-             raise SystemExit(err)
-         else:
-           commit_name, commit_email = rest, None
-         self.changes[(commit_name, commit_email)] = (proper_name, proper_email)
-
-   def translate(self, name, email):
-     ''' Given a name and email, return the expected new name and email from the
-         mailmap if there is a translation rule for it, otherwise just return
-         the given name and email.'''
-     for old, new in self.changes.items():
-       old_name, old_email = old
-       new_name, new_email = new
-       if (old_email is None or email.lower() == old_email.lower()) and (
-           name == old_name or not old_name):
-         return (new_name or name, new_email or email)
-     return (name, email)
-
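A sketch of MailmapInfo.translate(), assuming a mailmap file (the path and identities are illustrative) containing the single line `Proper Name <proper@example.com> Commit Name <commit@example.com>`:

  mm = MailmapInfo(b'.mailmap')
  assert mm.translate(b'Commit Name', b'commit@example.com') == \
         (b'Proper Name', b'proper@example.com')
  # Identities with no matching rule pass through unchanged:
  assert mm.translate(b'Other', b'other@example.com') == \
         (b'Other', b'other@example.com')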
- class ProgressWriter(object):
-   def __init__(self):
-     self._last_progress_update = time.time()
-     self._last_message = None
-
-   def show(self, msg):
-     self._last_message = msg
-     now = time.time()
-     if now - self._last_progress_update > .1:
-       self._last_progress_update = now
-       sys.stdout.write("\r{}".format(msg))
-       sys.stdout.flush()
-
-   def finish(self):
-     self._last_progress_update = 0
-     if self._last_message:
-       self.show(self._last_message)
-     sys.stdout.write("\n")
-
- class _IDs(object):
-   """
-   A class that maintains the 'name domain' of all the 'marks' (short int
-   id for a blob/commit git object). The reason this mechanism is necessary
-   is because the text of fast-export may refer to an object using a different
-   mark than the mark that was assigned to that object using IDS.new(). This
-   class allows you to translate the fast-export marks (old) to the marks
-   assigned from IDS.new() (new).
-
-   Note that there are two reasons why the marks may differ: (1) the
-   user manually creates Blob or Commit objects (for insertion into the
-   stream), and (2) we're reading the data from two different repositories
-   and trying to combine the data (git fast-export will number ids from
-   1...n, and having two 1's, two 2's, two 3's, causes issues).
-   """
-
-   def __init__(self):
-     """
-     Init
-     """
-     # The id for the next created blob/commit object
-     self._next_id = 1
-
-     # A map of old-ids to new-ids (1:1 map)
-     self._translation = {}
-
-     # A map of new-ids to every old-id that points to the new-id (1:N map)
-     self._reverse_translation = {}
-
-   def has_renames(self):
-     """
-     Return whether there have been ids remapped to new values
-     """
-     return bool(self._translation)
-
-   def new(self):
-     """
-     Should be called whenever a new blob or commit object is created. The
-     returned value should be used as the id/mark for that object.
-     """
-     rv = self._next_id
-     self._next_id += 1
-     return rv
-
-   def record_rename(self, old_id, new_id, handle_transitivity = False):
-     """
-     Record that old_id is being renamed to new_id.
-     """
-     if old_id != new_id:
-       # old_id -> new_id
-       self._translation[old_id] = new_id
-
-       # Transitivity will be needed if new commits are being inserted mid-way
-       # through a branch.
-       if handle_transitivity:
-         # Anything that points to old_id should point to new_id
-         if old_id in self._reverse_translation:
-           for id_ in self._reverse_translation[old_id]:
-             self._translation[id_] = new_id
-
-       # Record that new_id is pointed to by old_id
-       if new_id not in self._reverse_translation:
-         self._reverse_translation[new_id] = []
-       self._reverse_translation[new_id].append(old_id)
-
-   def translate(self, old_id):
-     """
-     If old_id has been mapped to an alternate id, return the alternate id.
-     """
-     if old_id in self._translation:
-       return self._translation[old_id]
-     else:
-       return old_id
-
-   def __str__(self):
-     """
-     Convert IDs to string; used for debugging
-     """
-     rv = "Current count: %d\nTranslation:\n" % self._next_id
-     for k in sorted(self._translation):
-       rv += "  %d -> %s\n" % (k, self._translation[k])
-
-     rv += "Reverse translation:\n"
-     for k in sorted(self._reverse_translation):
-       rv += "  " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n"
-
-     return rv
-
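A sketch of the mark translation _IDs provides; the mark numbers are illustrative:

  ids = _IDs()
  mark = ids.new()              # our own mark for an object, e.g. 1
  ids.record_rename(5, mark)    # the stream used mark 5 for the same object
  assert ids.translate(5) == mark
  assert ids.translate(7) == 7  # unmapped marks pass through untouched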
- class _GitElement(object):
-   """
-   The base class for all git elements that we create.
-   """
-
-   def __init__(self):
-     # A string that describes what type of Git element this is
-     self.type = None
-
-     # A flag telling us if this Git element has been dumped
-     # (i.e. printed) or skipped. Typically elements that have been
-     # dumped or skipped will not be dumped again.
-     self.dumped = 0
-
-   def dump(self, file_):
-     """
-     This version should never be called. Derived classes need to
-     override! We should note that subclasses should implement this
-     method such that the output would match the format produced by
-     fast-export.
-     """
-     raise SystemExit(_("Unimplemented function: %s") % type(self).__name__
-                      +".dump()") # pragma: no cover
-
-   def __bytes__(self):
-     """
-     Convert GitElement to bytestring; used for debugging
-     """
-     old_dumped = self.dumped
-     writeme = io.BytesIO()
-     self.dump(writeme)
-     output_lines = writeme.getvalue().splitlines()
-     writeme.close()
-     self.dumped = old_dumped
-     return b"%s:\n  %s" % (type(self).__name__.encode(),
-                            b"\n  ".join(output_lines))
-
-   def skip(self, new_id=None):
-     """
-     Ensures this element will not be written to output
-     """
-     self.dumped = 2
-
- class _GitElementWithId(_GitElement):
-   """
-   The base class for Git elements that have IDs (commits and blobs)
-   """
-
-   def __init__(self):
-     _GitElement.__init__(self)
-
-     # The mark (short, portable id) for this element
-     self.id = _IDS.new()
-
-     # The previous mark for this element
-     self.old_id = None
-
-   def skip(self, new_id=None):
-     """
-     This element will no longer be automatically written to output. When a
-     commit gets skipped, its ID will need to be translated to that of its
-     parent.
-     """
-     self.dumped = 2
-
-     _IDS.record_rename(self.old_id or self.id, new_id)
-
- class Blob(_GitElementWithId):
-   """
-   This class defines our representation of git blob elements (i.e. our
-   way of representing file contents).
-   """
-
-   def __init__(self, data, original_id = None):
-     _GitElementWithId.__init__(self)
-
-     # Denote that this is a blob
-     self.type = 'blob'
-
-     # Record original id
-     self.original_id = original_id
-
-     # Stores the blob's data
-     assert(type(data) == bytes)
-     self.data = data
-
-   def dump(self, file_):
-     """
-     Write this blob element to a file.
-     """
-     self.dumped = 1
-     HASH_TO_ID[self.original_id] = self.id
-     ID_TO_HASH[self.id] = self.original_id
-
-     file_.write(b'blob\n')
-     file_.write(b'mark :%d\n' % self.id)
-     file_.write(b'data %d\n%s' % (len(self.data), self.data))
-     file_.write(b'\n')
-
-
- class Reset(_GitElement):
-   """
-   This class defines our representation of git reset elements. A reset
-   event is the creation (or recreation) of a named branch, optionally
-   starting from a specific revision.
-   """
-
-   def __init__(self, ref, from_ref = None):
-     _GitElement.__init__(self)
-
-     # Denote that this is a reset
-     self.type = 'reset'
-
-     # The name of the branch being (re)created
-     self.ref = ref
-
-     # Some reference to the branch/commit we are resetting from
-     self.from_ref = from_ref
-
-   def dump(self, file_):
-     """
-     Write this reset element to a file
-     """
-     self.dumped = 1
-
-     file_.write(b'reset %s\n' % self.ref)
-     if self.from_ref:
-       if isinstance(self.from_ref, int):
-         file_.write(b'from :%d\n' % self.from_ref)
-       else:
-         file_.write(b'from %s\n' % self.from_ref)
-       file_.write(b'\n')
-
- class FileChange(_GitElement):
-   """
-   This class defines our representation of file change elements. File change
-   elements are components within a Commit element.
-   """
-
-   def __init__(self, type_, filename = None, id_ = None, mode = None):
-     _GitElement.__init__(self)
-
-     # Denote the type of file-change (b'M' for modify, b'D' for delete, etc)
-     # We could
-     #   assert(type(type_) == bytes)
-     # here, but I don't, due to worries about performance overhead...
-     self.type = type_
-
-     # Record the name of the file being changed
-     self.filename = filename
-
-     # Record the mode (mode describes type of file entry (non-executable,
-     # executable, or symlink)).
-     self.mode = mode
-
-     # blob_id is the id (mark) of the affected blob
-     self.blob_id = id_
-
-     if type_ == b'DELETEALL':
-       assert filename is None and id_ is None and mode is None
-       self.filename = b'' # Just so PathQuoting.enquote doesn't die
-     else:
-       assert filename is not None
-
-     if type_ == b'M':
-       assert id_ is not None and mode is not None
-     elif type_ == b'D':
-       assert id_ is None and mode is None
-     elif type_ == b'R':  # pragma: no cover (now avoid fast-export renames)
-       assert mode is None
-       if id_ is None:
-         raise SystemExit(_("new name needed for rename of %s") % filename)
-       self.filename = (self.filename, id_)
-       self.blob_id = None
-
-   def dump(self, file_):
-     """
-     Write this file-change element to a file
-     """
-     skipped_blob = (self.type == b'M' and self.blob_id is None)
-     if skipped_blob: return
-     self.dumped = 1
-
-     quoted_filename = PathQuoting.enquote(self.filename)
-     if self.type == b'M' and isinstance(self.blob_id, int):
-       file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename))
-     elif self.type == b'M':
-       file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename))
-     elif self.type == b'D':
-       file_.write(b'D %s\n' % quoted_filename)
-     elif self.type == b'DELETEALL':
-       file_.write(b'deleteall\n')
-     else:
-       raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover
-
- class Commit(_GitElementWithId):
-   """
-   This class defines our representation of commit elements. Commit elements
-   contain all the information associated with a commit.
-   """
-
-   def __init__(self, branch,
-                author_name, author_email, author_date,
-                committer_name, committer_email, committer_date,
-                message,
-                file_changes,
-                parents,
-                original_id = None,
-                encoding = None, # encoding for message; None implies UTF-8
-                **kwargs):
-     _GitElementWithId.__init__(self)
-     self.old_id = self.id
-
-     # Denote that this is a commit element
-     self.type = 'commit'
-
-     # Record the affected branch
-     self.branch = branch
-
-     # Record original id
-     self.original_id = original_id
-
-     # Record author's name
-     self.author_name = author_name
-
-     # Record author's email
-     self.author_email = author_email
-
-     # Record date of authoring
-     self.author_date = author_date
-
-     # Record committer's name
-     self.committer_name = committer_name
-
-     # Record committer's email
-     self.committer_email = committer_email
-
-     # Record date the commit was made
-     self.committer_date = committer_date
-
-     # Record commit message and its encoding
-     self.encoding = encoding
-     self.message = message
-
-     # List of file-changes associated with this commit. Note that file-changes
-     # are also represented as git elements
-     self.file_changes = file_changes
-
-     self.parents = parents
-
-   def dump(self, file_):
-     """
-     Write this commit element to a file.
-     """
-     self.dumped = 1
-     HASH_TO_ID[self.original_id] = self.id
-     ID_TO_HASH[self.id] = self.original_id
-
-     # Make output to fast-import slightly easier for humans to read if the
-     # message has no trailing newline of its own; cosmetic, but a nice touch...
-     extra_newline = b'\n'
-     if self.message.endswith(b'\n') or not (self.parents or self.file_changes):
-       extra_newline = b''
-
-     if not self.parents:
-       file_.write(b'reset %s\n' % self.branch)
-     file_.write((b'commit %s\n'
-                  b'mark :%d\n'
-                  b'author %s <%s> %s\n'
-                  b'committer %s <%s> %s\n'
-                 ) % (
-                   self.branch, self.id,
-                   self.author_name, self.author_email, self.author_date,
-                   self.committer_name, self.committer_email, self.committer_date
-                 ))
-     if self.encoding:
-       file_.write(b'encoding %s\n' % self.encoding)
-     file_.write(b'data %d\n%s%s' %
-                 (len(self.message), self.message, extra_newline))
-     for i, parent in enumerate(self.parents):
-       file_.write(b'from ' if i==0 else b'merge ')
-       if isinstance(parent, int):
-         file_.write(b':%d\n' % parent)
-       else:
-         file_.write(b'%s\n' % parent)
-     for change in self.file_changes:
-       change.dump(file_)
-     if not self.parents and not self.file_changes:
-       # Workaround a bug in pre-git-2.22 versions of fast-import with
-       # the get-mark directive.
-       file_.write(b'\n')
-     file_.write(b'\n')
-
-   def first_parent(self):
-     """
-     Return first parent commit
-     """
-     if self.parents:
-       return self.parents[0]
-     return None
-
-   def skip(self, new_id=None):
-     _SKIPPED_COMMITS.add(self.old_id or self.id)
-     _GitElementWithId.skip(self, new_id)
-
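For reference, a minimal one-parent commit serialized by Commit.dump() comes out in fast-import syntax roughly as follows (the mark numbers, identity, and path are illustrative); the `from` and `M` lines correspond to the parent and file-change loops above:

  commit refs/heads/main
  mark :2
  author Jane Doe <jane@example.com> 1234567890 +0000
  committer Jane Doe <jane@example.com> 1234567890 +0000
  data 8
  Initial
  from :1
  M 100644 :3 README.md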
- class Tag(_GitElementWithId):
-   """
-   This class defines our representation of annotated tag elements.
-   """
-
-   def __init__(self, ref, from_ref,
-                tagger_name, tagger_email, tagger_date, tag_msg,
-                original_id = None):
-     _GitElementWithId.__init__(self)
-     self.old_id = self.id
-
-     # Denote that this is a tag element
-     self.type = 'tag'
-
-     # Store the name of the tag
-     self.ref = ref
-
-     # Store the entity being tagged (this should be a commit)
-     self.from_ref = from_ref
-
-     # Record original id
-     self.original_id = original_id
-
-     # Store the name of the tagger
-     self.tagger_name = tagger_name
-
-     # Store the email of the tagger
-     self.tagger_email = tagger_email
-
-     # Store the date
-     self.tagger_date = tagger_date
-
-     # Store the tag message
-     self.message = tag_msg
-
-   def dump(self, file_):
-     """
-     Write this tag element to a file
-     """
-
-     self.dumped = 1
-     HASH_TO_ID[self.original_id] = self.id
-     ID_TO_HASH[self.id] = self.original_id
-
-     file_.write(b'tag %s\n' % self.ref)
-     if (write_marks and self.id):
-       file_.write(b'mark :%d\n' % self.id)
-     markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n'
-     file_.write(markfmt % self.from_ref)
-     if self.tagger_name:
-       file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
-       file_.write(self.tagger_date)
-       file_.write(b'\n')
-     file_.write(b'data %d\n%s' % (len(self.message), self.message))
-     file_.write(b'\n')
-
- class Progress(_GitElement):
-   """
-   This class defines our representation of progress elements. The progress
-   element only contains a progress message, which is printed by fast-import
-   when it processes the progress output.
-   """
-
-   def __init__(self, message):
-     _GitElement.__init__(self)
-
-     # Denote that this is a progress element
-     self.type = 'progress'
-
-     # Store the progress message
-     self.message = message
-
-   def dump(self, file_):
-     """
-     Write this progress element to a file
-     """
-     self.dumped = 1
-
-     file_.write(b'progress %s\n' % self.message)
-     file_.write(b'\n')
-
- class Checkpoint(_GitElement):
-   """
-   This class defines our representation of checkpoint elements. These
-   elements represent events which force fast-import to close the current
-   packfile, start a new one, and to save out all current branch refs, tags
-   and marks.
-   """
-
-   def __init__(self):
-     _GitElement.__init__(self)
-
-     # Denote that this is a checkpoint element
-     self.type = 'checkpoint'
-
-   def dump(self, file_):
-     """
-     Write this checkpoint element to a file
-     """
-     self.dumped = 1
-
-     file_.write(b'checkpoint\n')
-     file_.write(b'\n')
-
- class LiteralCommand(_GitElement):
-   """
-   This class defines our representation of commands. The literal command
-   includes only a single line, and is not processed in any special way.
-   """
-
-   def __init__(self, line):
-     _GitElement.__init__(self)
-
-     # Denote that this is a literal element
-     self.type = 'literal'
-
-     # Store the command
-     self.line = line
-
-   def dump(self, file_):
-     """
-     Write this literal command to a file
-     """
-     self.dumped = 1
-
-     file_.write(self.line)
-
- class Alias(_GitElement):
-   """
-   This class defines our representation of fast-import alias elements. An
-   alias element is the setting of one mark to the same sha1sum as another,
-   usually because the newer mark corresponded to a pruned commit.
-   """
-
-   def __init__(self, ref, to_ref):
-     _GitElement.__init__(self)
-     # Denote that this is an alias
-     self.type = 'alias'
-
-     self.ref = ref
-     self.to_ref = to_ref
-
-   def dump(self, file_):
-     """
-     Write this alias element to a file
-     """
-     self.dumped = 1
-
-     file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))
-
- class FastExportParser(object):
-   """
-   A class for parsing and handling the output from fast-export. This
-   class allows the user to register callbacks when various types of
-   data are encountered in the fast-export output. The basic idea is that
-   FastExportParser takes fast-export output, creates the various objects
-   as it encounters them, the user gets to use/modify these objects via
-   callbacks, and finally FastExportParser outputs the modified objects
-   in fast-import format (presumably so they can be used to create a new
-   repo).
-   """
-
-   def __init__(self,
-                tag_callback = None, commit_callback = None,
-                blob_callback = None, progress_callback = None,
-                reset_callback = None, checkpoint_callback = None,
-                done_callback = None):
-     # Members below simply store callback functions for the various git
-     # elements
-     self._tag_callback = tag_callback
-     self._blob_callback = blob_callback
-     self._reset_callback = reset_callback
-     self._commit_callback = commit_callback
-     self._progress_callback = progress_callback
-     self._checkpoint_callback = checkpoint_callback
-     self._done_callback = done_callback
-
-     # Keep track of which refs appear from the export, and which make it to
-     # the import (pruning of empty commits, renaming of refs, and creating
-     # new manual objects and inserting them can cause these to differ).
-     self._exported_refs = set()
-     self._imported_refs = set()
-
-     # A list of the branches we've seen, plus the last known commit they
-     # pointed to. An entry in latest_*commit will be deleted if we get a
-     # reset for that branch. These are used because of fast-import's weird
-     # decision to allow having an implicit parent via naming the branch
-     # instead of requiring branches to be specified via 'from' directives.
-     self._latest_commit = {}
-     self._latest_orig_commit = {}
-
-     # A handle to the input source for the fast-export data
-     self._input = None
-
-     # A handle to the output file for the output we generate (we call dump
-     # on many of the git elements we create).
-     self._output = None
-
-     # Stores the contents of the current line of input being parsed
-     self._currentline = ''
-
-     # Compile some regexes and cache those
-     self._mark_re = re.compile(br'mark :(\d+)\n$')
-     self._parent_regexes = {}
-     parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n')
-     for parent_refname in (b'from', b'merge'):
-       ans = [re.compile(parent_refname+x) for x in parent_regex_rules]
-       self._parent_regexes[parent_refname] = ans
-     self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"')
-     self._refline_regexes = {}
-     for refline_name in (b'reset', b'commit', b'tag', b'progress'):
-       self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$')
-     self._user_regexes = {}
-     for user in (b'author', b'committer', b'tagger'):
-       self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$')
-
-   def _advance_currentline(self):
-     """
-     Grab the next line of input
-     """
-     self._currentline = self._input.readline()
-
-   def _parse_optional_mark(self):
-     """
-     If the current line contains a mark, parse it and advance to the
-     next line; return None otherwise
-     """
-     mark = None
-     matches = self._mark_re.match(self._currentline)
-     if matches:
-       mark = int(matches.group(1))
-       self._advance_currentline()
-     return mark
-
-   def _parse_optional_parent_ref(self, refname):
-     """
-     If the current line contains a reference to a parent commit, then
-     parse it and advance the current line; otherwise return None. Note
-     that the name of the reference ('from', 'merge') must match the
-     refname arg.
-     """
-     orig_baseref, baseref = None, None
-     rule, altrule = self._parent_regexes[refname]
-     matches = rule.match(self._currentline)
-     if matches:
-       orig_baseref = int(matches.group(1))
-       # We translate the parent commit mark to what it needs to be in
-       # our mark namespace
-       baseref = _IDS.translate(orig_baseref)
-       self._advance_currentline()
-     else:
-       matches = altrule.match(self._currentline)
-       if matches:
-         orig_baseref = matches.group(1)
-         baseref = orig_baseref
-         self._advance_currentline()
-     return orig_baseref, baseref
-
-   def _parse_optional_filechange(self):
-     """
-     If the current line contains a file-change object, then parse it
-     and advance the current line; otherwise return None. We only care
-     about file changes of type b'M' and b'D' (these are the only types
-     of file-changes that fast-export will provide).
-     """
-     filechange = None
-     changetype = self._currentline[0:1]
-     if changetype == b'M':
-       (changetype, mode, idnum, path) = self._currentline.split(None, 3)
-       if idnum[0:1] == b':':
-         idnum = idnum[1:]
-       path = path.rstrip(b'\n')
-       # We translate the idnum to our id system
-       if len(idnum) != 40:
-         idnum = _IDS.translate( int(idnum) )
-       if idnum is not None:
-         if path.startswith(b'"'):
-           path = PathQuoting.dequote(path)
-         filechange = FileChange(b'M', path, idnum, mode)
-       else:
-         filechange = b'skipped'
-       self._advance_currentline()
-     elif changetype == b'D':
-       (changetype, path) = self._currentline.split(None, 1)
-       path = path.rstrip(b'\n')
-       if path.startswith(b'"'):
-         path = PathQuoting.dequote(path)
-       filechange = FileChange(b'D', path)
-       self._advance_currentline()
-     elif changetype == b'R': # pragma: no cover (now avoid fast-export renames)
-       rest = self._currentline[2:-1]
-       if rest.startswith(b'"'):
-         m = self._quoted_string_re.match(rest)
-         if not m:
-           raise SystemExit(_("Couldn't parse rename source"))
-         orig = PathQuoting.dequote(m.group(0))
-         new = rest[m.end()+1:]
-       else:
-         orig, new = rest.split(b' ', 1)
-       if new.startswith(b'"'):
-         new = PathQuoting.dequote(new)
-       filechange = FileChange(b'R', orig, new)
-       self._advance_currentline()
-     return filechange
-
-   def _parse_original_id(self):
-     original_id = self._currentline[len(b'original-oid '):].rstrip()
-     self._advance_currentline()
-     return original_id
-
-   def _parse_encoding(self):
-     encoding = self._currentline[len(b'encoding '):].rstrip()
-     self._advance_currentline()
-     return encoding
-
-   def _parse_ref_line(self, refname):
-     """
-     Parses string data (often a branch name) from current-line. The name of
-     the string data must match the refname arg. The program will crash if
-     current-line does not match, so current-line will always be advanced if
-     this method returns.
-     """
-     matches = self._refline_regexes[refname].match(self._currentline)
-     if not matches:
-       raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") %
-                        ({'refname': refname, 'line':self._currentline})
-                       ) # pragma: no cover
-     ref = matches.group(1)
-     self._advance_currentline()
-     return ref
-
-   def _parse_user(self, usertype):
-     """
-     Get user name, email, datestamp from current-line. Current-line will
-     be advanced.
-     """
-     user_regex = self._user_regexes[usertype]
-     (name, email, when) = user_regex.match(self._currentline).groups()
-
-     self._advance_currentline()
-     return (name, email, when)
-
-   def _parse_data(self):
-     """
-     Reads data from _input. Current-line will be advanced until it is beyond
-     the data.
-     """
-     fields = self._currentline.split()
-     assert fields[0] == b'data'
-     size = int(fields[1])
-     data = self._input.read(size)
-     self._advance_currentline()
-     if self._currentline == b'\n':
-       self._advance_currentline()
-     return data
-
-   def _parse_blob(self):
-     """
-     Parse input data into a Blob object. Once the Blob has been created, it
-     will be handed off to the appropriate callbacks. Current-line will be
-     advanced until it is beyond this blob's data. The Blob will be dumped
-     to _output once everything else is done (unless it has been skipped by
-     the callback).
-     """
-     # Parse the Blob
-     self._advance_currentline()
-     id_ = self._parse_optional_mark()
-
-     original_id = None
-     if self._currentline.startswith(b'original-oid'):
-       original_id = self._parse_original_id();
-
-     data = self._parse_data()
-     if self._currentline == b'\n':
-       self._advance_currentline()
-
-     # Create the blob
-     blob = Blob(data, original_id)
-
-     # If fast-export text had a mark for this blob, need to make sure this
-     # mark translates to the blob's true id.
-     if id_:
-       blob.old_id = id_
-       _IDS.record_rename(id_, blob.id)
-
-     # Call any user callback to allow them to use/modify the blob
-     if self._blob_callback:
-       self._blob_callback(blob)
-
-     # Now print the resulting blob
-     if not blob.dumped:
-       blob.dump(self._output)
-
-   def _parse_reset(self):
-     """
-     Parse input data into a Reset object. Once the Reset has been created,
-     it will be handed off to the appropriate callbacks. Current-line will
-     be advanced until it is beyond the reset data. The Reset will be dumped
-     to _output once everything else is done (unless it has been skipped by
-     the callback).
-     """
-     # Parse the Reset
-     ref = self._parse_ref_line(b'reset')
-     self._exported_refs.add(ref)
-     ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
-     if self._currentline == b'\n':
-       self._advance_currentline()
-
-     # fast-export likes to print extraneous resets that serve no purpose.
-     # While we could continue processing such resets, that is a waste of
-     # resources. Also, we want to avoid recording that this ref was
-     # seen in such cases, since this ref could be rewritten to nothing.
-     if not from_ref:
-       self._latest_commit.pop(ref, None)
-       self._latest_orig_commit.pop(ref, None)
-       return
-
-     # Create the reset
-     reset = Reset(ref, from_ref)
-
-     # Call any user callback to allow them to modify the reset
-     if self._reset_callback:
-       self._reset_callback(reset)
-
-     # Update metadata
-     self._latest_commit[reset.ref] = reset.from_ref
-     self._latest_orig_commit[reset.ref] = reset.from_ref
-
-     # Now print the resulting reset
-     if not reset.dumped:
-       self._imported_refs.add(reset.ref)
-       reset.dump(self._output)
-
-   def _parse_commit(self):
-     """
-     Parse input data into a Commit object. Once the Commit has been created,
-     it will be handed off to the appropriate callbacks. Current-line will
-     be advanced until it is beyond the commit data. The Commit will be dumped
-     to _output once everything else is done (unless it has been skipped by
-     the callback OR the callback has removed all file-changes from the commit).
-     """
-     # Parse the Commit. This may look involved, but it's pretty simple; it only
-     # looks bad because a commit object contains many pieces of data.
-     branch = self._parse_ref_line(b'commit')
-     self._exported_refs.add(branch)
-     id_ = self._parse_optional_mark()
-
-     original_id = None
-     if self._currentline.startswith(b'original-oid'):
-       original_id = self._parse_original_id();
-
-     author_name = None
-     author_email = None
-     if self._currentline.startswith(b'author'):
-       (author_name, author_email, author_date) = self._parse_user(b'author')
-
-     (committer_name, committer_email, committer_date) = \
-       self._parse_user(b'committer')
-
-     if not author_name and not author_email:
-       (author_name, author_email, author_date) = \
-         (committer_name, committer_email, committer_date)
-
-     encoding = None
-     if self._currentline.startswith(b'encoding '):
-       encoding = self._parse_encoding()
-
-     commit_msg = self._parse_data()
-
-     pinfo = [self._parse_optional_parent_ref(b'from')]
-     # Due to empty pruning, we can have real 'from' and 'merge' lines that
-     # due to commit rewriting map to a parent of None. We need to record
-     # 'from' if it's non-None, and we need to parse all 'merge' lines.
-     while self._currentline.startswith(b'merge '):
-       pinfo.append(self._parse_optional_parent_ref(b'merge'))
-     orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)]
-
-     # No parents is oddly represented as [None] instead of [], due to the
-     # special 'from' handling. Convert it here to a more canonical form.
-     if parents == [None]:
-       parents = []
-     if orig_parents == [None]:
-       orig_parents = []
-
-     # fast-import format is kinda stupid in that it allows implicit parents
-     # based on the branch name instead of requiring them to be specified by
-     # 'from' directives. The only way to get no parent is by using a reset
-     # directive first, which clears the latest_commit_for_this_branch tracking.
-     if not orig_parents and self._latest_commit.get(branch):
-       parents = [self._latest_commit[branch]]
-     if not orig_parents and self._latest_orig_commit.get(branch):
-       orig_parents = [self._latest_orig_commit[branch]]
-
-     # Get the list of file changes
-     file_changes = []
-     file_change = self._parse_optional_filechange()
-     had_file_changes = file_change is not None
-     while file_change:
-       if not (type(file_change) == bytes and file_change == b'skipped'):
-         file_changes.append(file_change)
-       file_change = self._parse_optional_filechange()
-     if self._currentline == b'\n':
-       self._advance_currentline()
-
-     # Okay, now we can finally create the Commit object
-     commit = Commit(branch,
-                     author_name, author_email, author_date,
-                     committer_name, committer_email, committer_date,
-                     commit_msg, file_changes, parents, original_id, encoding)
-
-     # If fast-export text had a mark for this commit, need to make sure this
-     # mark translates to the commit's true id.
-     if id_:
-       commit.old_id = id_
-       _IDS.record_rename(id_, commit.id)
-
-     # Call any user callback to allow them to modify the commit
-     aux_info = {'orig_parents': orig_parents,
-                 'had_file_changes': had_file_changes}
-     if self._commit_callback:
-       self._commit_callback(commit, aux_info)
-
-     # Now print the resulting commit, or if prunable skip it
-     self._latest_orig_commit[branch] = commit.id
-     if not (commit.old_id or commit.id) in _SKIPPED_COMMITS:
-       self._latest_commit[branch] = commit.id
-     if not commit.dumped:
-       self._imported_refs.add(commit.branch)
-       commit.dump(self._output)
-
-   def _parse_tag(self):
-     """
-     Parse input data into a Tag object. Once the Tag has been created,
-     it will be handed off to the appropriate callbacks. Current-line will
-     be advanced until it is beyond the tag data. The Tag will be dumped
-     to _output once everything else is done (unless it has been skipped by
-     the callback).
-     """
-     # Parse the Tag
-     tag = self._parse_ref_line(b'tag')
-     self._exported_refs.add(b'refs/tags/'+tag)
-     id_ = self._parse_optional_mark()
-     ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
-
-     original_id = None
-     if self._currentline.startswith(b'original-oid'):
-       original_id = self._parse_original_id();
-
-     tagger_name, tagger_email, tagger_date = None, None, None
-     if self._currentline.startswith(b'tagger'):
-       (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger')
-     tag_msg = self._parse_data()
-     if self._currentline == b'\n':
-       self._advance_currentline()
-
-     # Create the tag
-     tag = Tag(tag, from_ref,
-               tagger_name, tagger_email, tagger_date, tag_msg,
-               original_id)
-
-     # If fast-export text had a mark for this tag, need to make sure this
-     # mark translates to the tag's true id.
-     if id_:
-       tag.old_id = id_
-       _IDS.record_rename(id_, tag.id)
-
-     # Call any user callback to allow them to modify the tag
-     if self._tag_callback:
-       self._tag_callback(tag)
-
-     # The tag might not point at anything that still exists (self.from_ref
-     # will be None if the commit it pointed to and all its ancestors were
-     # pruned due to being empty)
-     if tag.from_ref:
-       # Print out this tag's information
-       if not tag.dumped:
-         self._imported_refs.add(b'refs/tags/'+tag.ref)
-         tag.dump(self._output)
-     else:
-       tag.skip()
-
-   def _parse_progress(self):
-     """
-     Parse input data into a Progress object. Once the Progress has
-     been created, it will be handed off to the appropriate
-     callbacks. Current-line will be advanced until it is beyond the
-     progress data. The Progress will be dumped to _output once
-     everything else is done (unless it has been skipped by the callback).
-     """
-     # Parse the Progress
-     message = self._parse_ref_line(b'progress')
-     if self._currentline == b'\n':
-       self._advance_currentline()
-
-     # Create the progress message
-     progress = Progress(message)
-
-     # Call any user callback to allow them to modify the progress message
-     if self._progress_callback:
-       self._progress_callback(progress)
-
-     # NOTE: By default, we do NOT print the progress message; git
-     # fast-import would write it to fast_import_pipes which could mess with
-     # our parsing of output from the 'ls' and 'get-mark' directives we send
-     # to fast-import. If users want these messages, they need to process
-     # and handle them in the appropriate callback above.
-
-   def _parse_checkpoint(self):
-     """
-     Parse input data into a Checkpoint object. Once the Checkpoint has
-     been created, it will be handed off to the appropriate
-     callbacks. Current-line will be advanced until it is beyond the
-     checkpoint data. The Checkpoint will be dumped to _output once
-     everything else is done (unless it has been skipped by the callback).
-     """
-     # Parse the Checkpoint
-     self._advance_currentline()
-     if self._currentline == b'\n':
-       self._advance_currentline()
-
-     # Create the checkpoint
-     checkpoint = Checkpoint()
-
-     # Call any user callback to allow them to drop the checkpoint
-     if self._checkpoint_callback:
-       self._checkpoint_callback(checkpoint)
-
-     # NOTE: By default, we do NOT print the checkpoint message; although
-     # we would only realistically get them with --stdin, the fact that we
-     # are filtering makes me think the checkpointing is less likely to be
-     # reasonable. In fact, I don't think it's necessary in general. If
-     # users do want it, they should process it in the checkpoint_callback.
-
-   def _parse_literal_command(self):
-     """
-     Parse literal command. Then just dump the line as is.
-     """
-     # Create the literal command object
-     command = LiteralCommand(self._currentline)
-     self._advance_currentline()
-
-     # Now print the resulting literal command
-     if not command.dumped:
-       command.dump(self._output)
-
-   def insert(self, obj):
-     assert not obj.dumped
-     obj.dump(self._output)
-     if type(obj) == Commit:
-       self._imported_refs.add(obj.branch)
-     elif type(obj) in (Reset, Tag):
-       self._imported_refs.add(obj.ref)
-
-   def run(self, input, output):
-     """
-     This method filters fast export output.
-     """
-     # Set input. If no args provided, use stdin.
-     self._input = input
-     self._output = output
-
-     # Run over the input and do the filtering
-     self._advance_currentline()
-     while self._currentline:
-       if self._currentline.startswith(b'blob'):
-         self._parse_blob()
-       elif self._currentline.startswith(b'reset'):
-         self._parse_reset()
-       elif self._currentline.startswith(b'commit'):
-         self._parse_commit()
-       elif self._currentline.startswith(b'tag'):
-         self._parse_tag()
-       elif self._currentline.startswith(b'progress'):
-         self._parse_progress()
-       elif self._currentline.startswith(b'checkpoint'):
-         self._parse_checkpoint()
-       elif self._currentline.startswith(b'feature'):
-         self._parse_literal_command()
-       elif self._currentline.startswith(b'option'):
-         self._parse_literal_command()
-       elif self._currentline.startswith(b'done'):
-         if self._done_callback:
-           self._done_callback()
-         self._parse_literal_command()
-         # Prevent confusion from others writing additional stuff that'll just
-         # be ignored
-         self._output.close()
-       elif self._currentline.startswith(b'#'):
-         self._parse_literal_command()
-       elif self._currentline.startswith(b'get-mark') or \
-            self._currentline.startswith(b'cat-blob') or \
-            self._currentline.startswith(b'ls'):
-         raise SystemExit(_("Unsupported command: '%s'") % self._currentline)
-       else:
-         raise SystemExit(_("Could not parse line: '%s'") % self._currentline)
-
-   def get_exported_and_imported_refs(self):
-     return self._exported_refs, self._imported_refs
-
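A sketch of wiring FastExportParser into the pipeline with a commit callback that drops one directory; the repositories and the `secrets/` path are illustrative, and rename-style file changes are ignored since the tool avoids fast-export renames:

  def drop_secrets(commit, aux_info):
    # Keep only file changes whose path is outside secrets/
    commit.file_changes = [fc for fc in commit.file_changes
                           if not fc.filename.startswith(b'secrets/')]

  exporter = subprocess.Popen(['git', 'fast-export',
                               '--show-original-ids', '--all'],
                              stdout=subprocess.PIPE)
  importer = subprocess.Popen(['git', 'fast-import', '--quiet'],
                              stdin=subprocess.PIPE)
  parser = FastExportParser(commit_callback=drop_secrets)
  parser.run(exporter.stdout, importer.stdin)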
- def record_id_rename(old_id, new_id):
-   """
-   Register a new translation
-   """
-   handle_transitivity = True
-   _IDS.record_rename(old_id, new_id, handle_transitivity)
-
- # Internal globals
- _IDS = _IDs()
- _SKIPPED_COMMITS = set()
- HASH_TO_ID = {}
- ID_TO_HASH = {}
-
- class SubprocessWrapper(object):
-   @staticmethod
-   def decodify(args):
-     if type(args) == str:
-       return args
-     else:
-       assert type(args) == list
-       return [decode(x) if type(x)==bytes else x for x in args]
-
-   @staticmethod
-   def call(*args, **kwargs):
-     if 'cwd' in kwargs:
-       kwargs['cwd'] = decode(kwargs['cwd'])
-     return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs)
-
-   @staticmethod
-   def check_output(*args, **kwargs):
-     if 'cwd' in kwargs:
-       kwargs['cwd'] = decode(kwargs['cwd'])
-     return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs)
-
-   @staticmethod
-   def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely
-     if 'cwd' in kwargs:
-       kwargs['cwd'] = decode(kwargs['cwd'])
-     return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs)
-
-   @staticmethod
-   def Popen(*args, **kwargs):
-     if 'cwd' in kwargs:
-       kwargs['cwd'] = decode(kwargs['cwd'])
-     return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs)
-
- subproc = subprocess
- if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ:
-   subproc = SubprocessWrapper
-
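The wrapper above exists because subprocess on Windows cannot take bytes arguments, so decodify() converts byte arguments to str before handing them on. A short sketch (the argument list is illustrative):

  args = SubprocessWrapper.decodify(['git', b'log', b'--', b'some/path'])
  assert args == ['git', 'log', '--', 'some/path']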
1489
- class GitUtils(object):
1490
- @staticmethod
1491
- def get_commit_count(repo, *args):
1492
- """
1493
- Return the number of commits that have been made on repo.
1494
- """
1495
- if not args:
1496
- args = ['--all']
1497
- if len(args) == 1 and isinstance(args[0], list):
1498
- args = args[0]
1499
- p = subproc.Popen(["git", "rev-list", "--count"] + args,
1500
- stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1501
- cwd=repo)
1502
- if p.wait() != 0:
1503
- raise SystemExit(_("%s does not appear to be a valid git repository")
1504
- % decode(repo))
1505
- return int(p.stdout.read())
1506
-
1507
- @staticmethod
1508
- def get_total_objects(repo):
1509
- """
1510
- Return the number of objects (both packed and unpacked)
1511
- """
1512
- p1 = subproc.Popen(["git", "count-objects", "-v"],
1513
- stdout=subprocess.PIPE, cwd=repo)
1514
- lines = p1.stdout.read().splitlines()
1515
- # Return unpacked objects + packed-objects
1516
- return int(lines[0].split()[1]) + int(lines[2].split()[1])
1517
-
1518
- @staticmethod
1519
- def is_repository_bare(repo_working_dir):
1520
- out = subproc.check_output('git rev-parse --is-bare-repository'.split(),
1521
- cwd=repo_working_dir)
1522
- return (out.strip() == b'true')
1523
-
1524
- @staticmethod
1525
- def determine_git_dir(repo_working_dir):
1526
- d = subproc.check_output('git rev-parse --git-dir'.split(),
1527
- cwd=repo_working_dir).strip()
1528
- if repo_working_dir==b'.' or d.startswith(b'/'):
1529
- return d
1530
- return os.path.join(repo_working_dir, d)
1531
-
1532
- @staticmethod
1533
- def get_refs(repo_working_dir):
1534
- try:
1535
- output = subproc.check_output('git show-ref'.split(),
1536
- cwd=repo_working_dir)
1537
- except subprocess.CalledProcessError as e:
1538
- # If error code is 1, there just aren't any refs; i.e. new repo.
1539
- # If error code is other than 1, some other error (e.g. not a git repo)
1540
- if e.returncode != 1:
1541
- raise SystemExit('fatal: {}'.format(e))
1542
- output = ''
1543
- return dict(reversed(x.split()) for x in output.splitlines())
1544
-
1545
- @staticmethod
1546
- def get_blob_sizes(quiet = False):
1547
- blob_size_progress = ProgressWriter()
1548
- num_blobs = 0
1549
- processed_blobs_msg = _("Processed %d blob sizes")
1550
-
1551
- # Get sizes of blobs by sha1
1552
- cmd = '--batch-check=%(objectname) %(objecttype) ' + \
1553
- '%(objectsize) %(objectsize:disk)'
1554
- cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
1555
- bufsize = -1,
1556
- stdout = subprocess.PIPE)
1557
- unpacked_size = {}
1558
- packed_size = {}
1559
- for line in cf.stdout:
1560
- sha, objtype, objsize, objdisksize = line.split()
1561
- objsize, objdisksize = int(objsize), int(objdisksize)
1562
- if objtype == b'blob':
1563
- unpacked_size[sha] = objsize
1564
- packed_size[sha] = objdisksize
1565
- num_blobs += 1
1566
- if not quiet:
1567
- blob_size_progress.show(processed_blobs_msg % num_blobs)
1568
- cf.wait()
1569
- if not quiet:
1570
- blob_size_progress.finish()
1571
- return unpacked_size, packed_size
1572
-
1573
- @staticmethod
1574
- def get_file_changes(repo, parent_hash, commit_hash):
1575
- """
1576
- Return a FileChanges list with the differences between parent_hash
1577
- and commit_hash
1578
- """
1579
- file_changes = []
1580
-
1581
- cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash]
1582
- output = subproc.check_output(cmd, cwd=repo)
1583
- for line in output.splitlines():
1584
- fileinfo, path = line.split(b'\t', 1)
1585
- if path.startswith(b'"'):
1586
- path = PathQuoting.dequote(path)
1587
- oldmode, mode, oldhash, newhash, changetype = fileinfo.split()
1588
- if changetype == b'D':
1589
- file_changes.append(FileChange(b'D', path))
1590
- elif changetype in (b'A', b'M', b'T'):
1591
- identifier = HASH_TO_ID.get(newhash, newhash)
1592
- file_changes.append(FileChange(b'M', path, identifier, mode))
1593
- else: # pragma: no cover
1594
- raise SystemExit("Unknown change type for line {}".format(line))
1595
-
1596
- return file_changes
1597
-
1598
- @staticmethod
1599
- def print_my_version():
1600
- with open(__file__, 'br') as f:
1601
- contents = f.read()
1602
- # If people replaced the @@LOCALEDIR@@ string to point at their local
1603
- # directory, undo it so we can get the original source version.
1604
- contents = re.sub(br'\A#\!.*',
1605
- br'#!/usr/bin/env python3', contents)
1606
- contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"',
1607
- br'\1@@LOCALEDIR@@"', contents)
1608
-
1609
- cmd = 'git hash-object --stdin'.split()
1610
- version = subproc.check_output(cmd, input=contents).strip()
1611
- print(decode(version[0:12]))
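# The "version" printed is thus the first 12 hex digits of the blob id of the
# normalized script source, e.g. something like 'a62d0f0d99f6' (value invented
# here), rather than a semantic version number.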
1612
-
1613
- class FilteringOptions(object):
1614
- default_replace_text = b'***REMOVED***'
1615
- class AppendFilter(argparse.Action):
1616
- def __call__(self, parser, namespace, values, option_string=None):
1617
- user_path = values
1618
- suffix = option_string[len('--path-'):] or 'match'
1619
- if suffix.startswith('rename'):
1620
- mod_type = 'rename'
1621
- match_type = option_string[len('--path-rename-'):] or 'match'
1622
- values = values.split(b':')
1623
- if len(values) != 2:
1624
- raise SystemExit(_("Error: --path-rename expects one colon in its"
1625
- " argument: <old_name:new_name>."))
1626
- if values[0] and values[1] and not (
1627
- values[0].endswith(b'/') == values[1].endswith(b'/')):
1628
- raise SystemExit(_("Error: With --path-rename, if OLD_NAME and "
1629
- "NEW_NAME are both non-empty and either ends "
1630
- "with a slash then both must."))
1631
- if any(v.startswith(b'/') for v in values):
1632
- raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
1633
- components = values[0].split(b'/') + values[1].split(b'/')
1634
- else:
1635
- mod_type = 'filter'
1636
- match_type = suffix
1637
- components = values.split(b'/')
1638
- if values.startswith(b'/'):
1639
- raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
1640
- for illegal_path in [b'.', b'..']:
1641
- if illegal_path in components:
1642
- raise SystemExit(_("Error: Invalid path component '%s' found in '%s'")
1643
- % (decode(illegal_path), decode(user_path)))
1644
- if match_type == 'regex':
1645
- values = re.compile(values)
1646
- items = getattr(namespace, self.dest, []) or []
1647
- items.append((mod_type, match_type, values))
1648
- if (match_type, mod_type) == ('glob', 'filter'):
1649
- if not values.endswith(b'*'):
1650
- extension = b'*' if values.endswith(b'/') else b'/*'
1651
- items.append((mod_type, match_type, values+extension))
1652
- setattr(namespace, self.dest, items)
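# To illustrate the glob special case above: --path-glob 'src' stores both
# ('filter', 'glob', b'src') and ('filter', 'glob', b'src/*') so that files
# under a matched directory are selected too, while a glob already ending in
# '*' (e.g. 'src/*') is stored unchanged.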
1653
-
1654
- class HelperFilter(argparse.Action):
1655
- def __call__(self, parser, namespace, values, option_string=None):
1656
- af = FilteringOptions.AppendFilter(dest='path_changes',
1657
- option_strings=None)
1658
- dirname = values if values[-1:] == b'/' else values+b'/'
1659
- if option_string == '--subdirectory-filter':
1660
- af(parser, namespace, dirname, '--path-match')
1661
- af(parser, namespace, dirname+b':', '--path-rename')
1662
- elif option_string == '--to-subdirectory-filter':
1663
- af(parser, namespace, b':'+dirname, '--path-rename')
1664
- else:
1665
- raise SystemExit(_("Error: HelperFilter given invalid option_string: %s")
1666
- % option_string) # pragma: no cover
1667
-
1668
- class FileWithPathsFilter(argparse.Action):
1669
- def __call__(self, parser, namespace, values, option_string=None):
1670
- if not namespace.path_changes:
1671
- namespace.path_changes = []
1672
- namespace.path_changes += FilteringOptions.get_paths_from_file(values)
1673
-
1674
- @staticmethod
1675
- def create_arg_parser():
1676
- # Include usage in the summary, so we can put the description first
1677
- summary = _('''Rewrite (or analyze) repository history
1678
-
1679
- git-filter-repo destructively rewrites history (unless --analyze or
1680
- --dry-run are given) according to specified rules. It refuses to do any
1681
- rewriting unless either run from a clean fresh clone, or --force was
1682
- given.
1683
-
1684
- Basic Usage:
1685
- git-filter-repo --analyze
1686
- git-filter-repo [FILTER/RENAME/CONTROL OPTIONS]
1687
-
1688
- See EXAMPLES section for details.
1689
- ''').rstrip()
1690
-
1691
- # Provide a long helpful examples section
1692
- example_text = _('''CALLBACKS
1693
-
1694
- All callback functions are of the same general format. For a command line
1695
- argument like
1696
- --foo-callback 'BODY'
1697
-
1698
- the following code will be compiled and called:
1699
- def foo_callback(foo):
1700
- BODY
1701
-
1702
- Thus, to replace 'Jon' with 'John' in author/committer/tagger names:
1703
- git filter-repo --name-callback 'return name.replace(b"Jon", b"John")'
1704
-
1705
- To remove all 'Tested-by' tags in commit (or tag) messages:
1706
- git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", b"", message)'
1707
-
1708
- To remove all .DS_Store files:
1709
- git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename'
1710
-
1711
- Note that if BODY resolves to a filename, then the contents of that file
1712
- will be used as the BODY in the callback function.
1713
-
1714
- For more detailed examples and explanations AND caveats, see
1715
- https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS
1716
-
1717
- EXAMPLES
1718
-
1719
- To get a bunch of reports mentioning renames that have occurred in
1720
- your repo and listing sizes of objects aggregated by any of path,
1721
- directory, extension, or blob-id:
1722
- git filter-repo --analyze
1723
-
1724
- (These reports can help you choose how to filter your repo; it can
1725
- be useful to re-run this command after filtering to regenerate the
1726
- report and verify the changes look correct.)
1727
-
1728
- To extract the history that touched just 'guides' and 'tools/releases':
1729
- git filter-repo --path guides/ --path tools/releases
1730
-
1731
- To remove foo.zip and bar/baz/zips from every revision in history:
1732
- git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths
1733
-
1734
- To replace the text 'password' with 'p455w0rd':
1735
- git filter-repo --replace-text <(echo "password==>p455w0rd")
1736
-
1737
- To use the current version of the .mailmap file to update authors,
1738
- committers, and taggers throughout history and make it permanent:
1739
- git filter-repo --use-mailmap
1740
-
1741
- To extract the history of 'src/', rename all files to have a new leading
1742
- directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and
1743
- add a 'my-module-' prefix to all tags:
1744
- git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-'
1745
-
1746
- For more detailed examples and explanations, see
1747
- https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''')
1748
-
1749
- # Create the basic parser
1750
- parser = argparse.ArgumentParser(description=summary,
1751
- usage = argparse.SUPPRESS,
1752
- add_help = False,
1753
- epilog = example_text,
1754
- formatter_class=argparse.RawDescriptionHelpFormatter)
1755
-
1756
- analyze = parser.add_argument_group(title=_("Analysis"))
1757
- analyze.add_argument('--analyze', action='store_true',
1758
- help=_("Analyze repository history and create a report that may be "
1759
- "useful in determining what to filter in a subsequent run. "
1760
- "Will not modify your repo."))
1761
- analyze.add_argument('--report-dir',
1762
- metavar='DIR_OR_FILE',
1763
- type=os.fsencode,
1764
- dest='report_dir',
1765
- help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis,"
1766
- "refuses to run if exists, --force delete existing dir first."))
1767
-
1768
- path = parser.add_argument_group(title=_("Filtering based on paths "
1769
- "(see also --filename-callback)"),
1770
- description=textwrap.dedent(_("""
1771
- These options specify the paths to select. Note that much like git
1772
- itself, renames are NOT followed so you may need to specify multiple
1773
- paths, e.g. `--path olddir/ --path newdir/`
1774
- """[1:])))
1775
-
1776
- path.add_argument('--invert-paths', action='store_false', dest='inclusive',
1777
- help=_("Invert the selection of files from the specified "
1778
- "--path-{match,glob,regex} options below, i.e. only select "
1779
- "files matching none of those options."))
1780
-
1781
- path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE',
1782
- type=os.fsencode,
1783
- action=FilteringOptions.AppendFilter, dest='path_changes',
1784
- help=_("Exact paths (files or directories) to include in filtered "
1785
- "history. Multiple --path options can be specified to get "
1786
- "a union of paths."))
1787
- path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode,
1788
- action=FilteringOptions.AppendFilter, dest='path_changes',
1789
- help=_("Glob of paths to include in filtered history. Multiple "
1790
- "--path-glob options can be specified to get a union of "
1791
- "paths."))
1792
- path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode,
1793
- action=FilteringOptions.AppendFilter, dest='path_changes',
1794
- help=_("Regex of paths to include in filtered history. Multiple "
1795
- "--path-regex options can be specified to get a union of "
1796
- "paths"))
1797
- path.add_argument('--use-base-name', action='store_true',
1798
- help=_("Match on file base name instead of full path from the top "
1799
- "of the repo. Incompatible with --path-rename, and "
1800
- "incompatible with matching against directory names."))
1801
-
1802
- rename = parser.add_argument_group(title=_("Renaming based on paths "
1803
- "(see also --filename-callback)"))
1804
- rename.add_argument('--path-rename', '--path-rename-match',
1805
- metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode,
1806
- action=FilteringOptions.AppendFilter,
1807
- help=_("Path to rename; if filename or directory matches OLD_NAME "
1808
- "rename to NEW_NAME. Multiple --path-rename options can be "
1809
- "specified. NOTE: If you combine filtering options with "
1810
- "renaming ones, do not rely on a rename argument to select "
1811
- "paths; you also need a filter to select them."))
1812
-
1813
- helpers = parser.add_argument_group(title=_("Path shortcuts"))
1814
- helpers.add_argument('--paths-from-file', metavar='FILENAME',
1815
- type=os.fsencode,
1816
- action=FilteringOptions.FileWithPathsFilter, dest='path_changes',
1817
- help=_("Specify several path filtering and renaming directives, one "
1818
- "per line. Lines with '==>' in them specify path renames, "
1819
- "and lines can begin with 'literal:' (the default), 'glob:', "
1820
- "or 'regex:' to specify different matching styles. Blank "
1821
- "lines and lines starting with a '#' are ignored."))
1822
- helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY',
1823
- action=FilteringOptions.HelperFilter, type=os.fsencode,
1824
- help=_("Only look at history that touches the given subdirectory "
1825
- "and treat that directory as the project root. Equivalent "
1826
- "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'"))
1827
- helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY',
1828
- action=FilteringOptions.HelperFilter, type=os.fsencode,
1829
- help=_("Treat the project root as instead being under DIRECTORY. "
1830
- "Equivalent to using '--path-rename :DIRECTORY/'"))
1831
-
1832
- contents = parser.add_argument_group(title=_("Content editing filters "
1833
- "(see also --blob-callback)"))
1834
- contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE',
1835
- help=_("A file with expressions that, if found, will be replaced. "
1836
- "By default, each expression is treated as literal text, "
1837
- "but 'regex:' and 'glob:' prefixes are supported. You can "
1838
- "end the line with '==>' and some replacement text to "
1839
- "choose a replacement choice other than the default of '{}'."
1840
- .format(decode(FilteringOptions.default_replace_text))))
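# A hypothetical --replace-text expressions file covering each style the help
# text above mentions (all patterns invented for illustration):
#   password==>p455w0rd
#   regex:api_key=\w+==>api_key=REDACTED
#   glob:secret-*-token
#   hunter2
# The last two lines have no '==>', so their matches become '***REMOVED***'.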
1841
- contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE',
1842
- dest='max_blob_size', default=0,
1843
- help=_("Strip blobs (files) bigger than specified size (e.g. '5M', "
1844
- "'2G', etc)"))
1845
- contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME',
1846
- help=_("Read git object ids from each line of the given file, and "
1847
- "strip all of them from history"))
1848
-
1849
- refrename = parser.add_argument_group(title=_("Renaming of refs "
1850
- "(see also --refname-callback)"))
1851
- refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode,
1852
- help=_("Rename tags starting with OLD to start with NEW. For "
1853
- "example, --tag-rename foo:bar will rename tag foo-1.2.3 "
1854
- "to bar-1.2.3; either OLD or NEW can be empty."))
1855
-
1856
- messages = parser.add_argument_group(title=_("Filtering of commit messages "
1857
- "(see also --message-callback)"))
1858
- messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE',
1859
- help=_("A file with expressions that, if found in commit messages, "
1860
- "will be replaced. This file uses the same syntax as "
1861
- "--replace-text."))
1862
- messages.add_argument('--preserve-commit-hashes', action='store_true',
1863
- help=_("By default, since commits are rewritten and thus gain new "
1864
- "hashes, references to old commit hashes in commit messages "
1865
- "are replaced with new commit hashes (abbreviated to the same "
1866
- "length as the old reference). Use this flag to turn off "
1867
- "updating commit hashes in commit messages."))
1868
- messages.add_argument('--preserve-commit-encoding', action='store_true',
1869
- help=_("Do not reencode commit messages into UTF-8. By default, if "
1870
- "the commit object specifies an encoding for the commit "
1871
- "message, the message is re-encoded into UTF-8."))
1872
-
1873
- people = parser.add_argument_group(title=_("Filtering of names & emails "
1874
- "(see also --name-callback "
1875
- "and --email-callback)"))
1876
- people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME',
1877
- type=os.fsencode,
1878
- help=_("Use specified mailmap file (see git-shortlog(1) for "
1879
- "details on the format) when rewriting author, committer, "
1880
- "and tagger names and emails. If the specified file is "
1881
- "part of git history, historical versions of the file will "
1882
- "be ignored; only the current contents are consulted."))
1883
- people.add_argument('--use-mailmap', dest='mailmap',
1884
- action='store_const', const=b'.mailmap',
1885
- help=_("Same as: '--mailmap .mailmap' "))
1886
-
1887
- parents = parser.add_argument_group(title=_("Parent rewriting"))
1888
- parents.add_argument('--replace-refs', default=None,
1889
- choices=['delete-no-add', 'delete-and-add',
1890
- 'update-no-add', 'update-or-add',
1891
- 'update-and-add'],
1892
- help=_("Replace refs (see git-replace(1)) are used to rewrite "
1893
- "parents (unless turned off by the usual git mechanism); this "
1894
- "flag specifies what do do with those refs afterward. "
1895
- "Replace refs can either be deleted or updated to point at new "
1896
- "commit hashes. Also, new replace refs can be added for each "
1897
- "commit rewrite. With 'update-or-add', new replace refs are "
1898
- "only added for commit rewrites that aren't used to update an "
1899
- "existing replace ref. default is 'update-and-add' if "
1900
- "$GIT_DIR/filter-repo/already_ran does not exist; "
1901
- "'update-or-add' otherwise."))
1902
- parents.add_argument('--prune-empty', default='auto',
1903
- choices=['always', 'auto', 'never'],
1904
- help=_("Whether to prune empty commits. 'auto' (the default) means "
1905
- "only prune commits which become empty (not commits which were "
1906
- "empty in the original repo, unless their parent was pruned). "
1907
- "When the parent of a commit is pruned, the first non-pruned "
1908
- "ancestor becomes the new parent."))
1909
- parents.add_argument('--prune-degenerate', default='auto',
1910
- choices=['always', 'auto', 'never'],
1911
- help=_("Since merge commits are needed for history topology, they "
1912
- "are typically exempt from pruning. However, they can become "
1913
- "degenerate with the pruning of other commits (having fewer "
1914
- "than two parents, having one commit serve as both parents, or "
1915
- "having one parent as the ancestor of the other.) If such "
1916
- "merge commits have no file changes, they can be pruned. The "
1917
- "default ('auto') is to only prune empty merge commits which "
1918
- "become degenerate (not which started as such)."))
1919
- parents.add_argument('--no-ff', action='store_true',
1920
- help=_("Even if the first parent is or becomes an ancestor of another "
1921
- "parent, do not prune it. This modifies how "
1922
- "--prune-degenerate behaves, and may be useful in projects who "
1923
- "always use merge --no-ff."))
1924
-
1925
- callback = parser.add_argument_group(title=_("Generic callback code snippets"))
1926
- callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE",
1927
- help=_("Python code body for processing filenames; see CALLBACKS "
1928
- "sections below."))
1929
- callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE",
1930
- help=_("Python code body for processing messages (both commit "
1931
- "messages and tag messages); see CALLBACKS section below."))
1932
- callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE",
1933
- help=_("Python code body for processing names of people; see "
1934
- "CALLBACKS section below."))
1935
- callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE",
1936
- help=_("Python code body for processing emails addresses; see "
1937
- "CALLBACKS section below."))
1938
- callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE",
1939
- help=_("Python code body for processing refnames; see CALLBACKS "
1940
- "section below."))
1941
-
1942
- callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE",
1943
- help=_("Python code body for processing blob objects; see "
1944
- "CALLBACKS section below."))
1945
- callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE",
1946
- help=_("Python code body for processing commit objects; see "
1947
- "CALLBACKS section below."))
1948
- callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE",
1949
- help=_("Python code body for processing tag objects; see CALLBACKS "
1950
- "section below."))
1951
- callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE",
1952
- help=_("Python code body for processing reset objects; see "
1953
- "CALLBACKS section below."))
1954
-
1955
- desc = _(
1956
- "Specifying alternate source or target locations implies --partial,\n"
1957
- "except that the normal default for --replace-refs is used. However,\n"
1958
- "unlike normal uses of --partial, this doesn't risk mixing old and new\n"
1959
- "history since the old and new histories are in different repositories.")
1960
- location = parser.add_argument_group(title=_("Location to filter from/to"),
1961
- description=desc)
1962
- location.add_argument('--source', type=os.fsencode,
1963
- help=_("Git repository to read from"))
1964
- location.add_argument('--target', type=os.fsencode,
1965
- help=_("Git repository to overwrite with filtered history"))
1966
-
1967
- misc = parser.add_argument_group(title=_("Miscellaneous options"))
1968
- misc.add_argument('--help', '-h', action='store_true',
1969
- help=_("Show this help message and exit."))
1970
- misc.add_argument('--version', action='store_true',
1971
- help=_("Display filter-repo's version and exit."))
1972
- misc.add_argument('--force', '-f', action='store_true',
1973
- help=_("Rewrite repository history even if the current repo does not "
1974
- "look like a fresh clone. History rewriting is irreversible "
1975
- "(and includes immediate pruning of reflogs and old objects), "
1976
- "so be cautious about using this flag."))
1977
- misc.add_argument('--partial', action='store_true',
1978
- help=_("Do a partial history rewrite, resulting in the mixture of "
1979
- "old and new history. This implies a default of "
1980
- "update-no-add for --replace-refs, disables rewriting "
1981
- "refs/remotes/origin/* to refs/heads/*, disables removing "
1982
- "of the 'origin' remote, disables removing unexported refs, "
1983
- "disables expiring the reflog, and disables the automatic "
1984
- "post-filter gc. Also, this modifies --tag-rename and "
1985
- "--refname-callback options such that instead of replacing "
1986
- "old refs with new refnames, it will instead create new "
1987
- "refs and keep the old ones around. Use with caution."))
1988
- # WARNING: --refs presents a problem with become-degenerate pruning:
1989
- # * Excluding a commit also excludes its ancestors so when some other
1990
- # commit has an excluded ancestor as a parent we have no way of
1991
- # knowing what it is an ancestor of without doing a special
1992
- # full-graph walk.
1993
- misc.add_argument('--refs', nargs='+',
1994
- help=_("Limit history rewriting to the specified refs. Implies "
1995
- "--partial. In addition to the normal caveats of --partial "
1996
- "(mixing old and new history, no automatic remapping of "
1997
- "refs/remotes/origin/* to refs/heads/*, etc.), this also may "
1998
- "cause problems for pruning of degenerate empty merge "
1999
- "commits when negative revisions are specified."))
2000
-
2001
- misc.add_argument('--dry-run', action='store_true',
2002
- help=_("Do not change the repository. Run `git fast-export` and "
2003
- "filter its output, and save both the original and the "
2004
- "filtered version for comparison. This also disables "
2005
- "rewriting commit messages due to not knowing new commit "
2006
- "IDs and disables filtering of some empty commits due to "
2007
- "inability to query the fast-import backend." ))
2008
- misc.add_argument('--debug', action='store_true',
2009
- help=_("Print additional information about operations being "
2010
- "performed and commands being run. When used together "
2011
- "with --dry-run, also show extra information about what "
2012
- "would be run."))
2013
- # WARNING: --state-branch has some problems:
2014
- # * It does not work well with manually inserted objects (user creating
2015
- # Blob() or Commit() or Tag() objects and calling
2016
- # RepoFilter.insert(obj) on them).
2017
- # * It does not work well with multiple source or multiple target repos
2018
- # * It doesn't work so well with pruning become-empty commits (though
2019
- # --refs doesn't work so well with it either)
2020
- # These are probably fixable, given some work (e.g. re-importing the
2021
- # graph at the beginning to get the AncestryGraph right, doing our own
2022
- # export of marks instead of using fast-export --export-marks, etc.), but
2023
- # for now just hide the option.
2024
- misc.add_argument('--state-branch',
2025
- #help=_("Enable incremental filtering by saving the mapping of old "
2026
- # "to new objects to the specified branch upon exit, and"
2027
- # "loading that mapping from that branch (if it exists) "
2028
- # "upon startup."))
2029
- help=argparse.SUPPRESS)
2030
- misc.add_argument('--stdin', action='store_true',
2031
- help=_("Instead of running `git fast-export` and filtering its "
2032
- "output, filter the fast-export stream from stdin. The "
2033
- "stdin must be in the expected input format (e.g. it needs "
2034
- "to include original-oid directives)."))
2035
- misc.add_argument('--quiet', action='store_true',
2036
- help=_("Pass --quiet to other git commands called"))
2037
- return parser
2038
-
2039
- @staticmethod
2040
- def sanity_check_args(args):
2041
- if args.analyze and args.path_changes:
2042
- raise SystemExit(_("Error: --analyze is incompatible with --path* flags; "
2043
- "it's a read-only operation."))
2044
- if args.analyze and args.stdin:
2045
- raise SystemExit(_("Error: --analyze is incompatible with --stdin."))
2046
- # If no path_changes are found, initialize with empty list but mark as
2047
- # not inclusive so that all files match
2048
- if args.path_changes == None:
2049
- args.path_changes = []
2050
- args.inclusive = False
2051
- else:
2052
- # Similarly, if we have no filtering paths, then no path should be
2053
- # filtered out. Based on how newname() works, the easiest way to
2054
- # achieve that is setting args.inclusive to False.
2055
- if not any(x[0] == 'filter' for x in args.path_changes):
2056
- args.inclusive = False
2057
- # Also check for incompatible --use-base-name and --path-rename flags.
2058
- if args.use_base_name:
2059
- if any(x[0] == 'rename' for x in args.path_changes):
2060
- raise SystemExit(_("Error: --use-base-name and --path-rename are "
2061
- "incompatible."))
2062
- # Also run some sanity checks on the git version here;
2063
- # PERF: remove these checks once new enough git versions are common
2064
- p = subproc.Popen('git fast-export -h'.split(),
2065
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2066
- output = p.stdout.read()
2067
- if b'--anonymize-map' not in output: # pragma: no cover
2068
- global date_format_permissive
2069
- date_format_permissive = False
2070
- if b'--mark-tags' not in output: # pragma: no cover
2071
- global write_marks
2072
- write_marks = False
2073
- if args.state_branch:
2074
- # We need a version of git-fast-export with --mark-tags
2075
- raise SystemExit(_("Error: need git >= 2.24.0"))
2076
- if b'--reencode' not in output: # pragma: no cover
2077
- if args.preserve_commit_encoding:
2078
- # We need a version of git-fast-export with --reencode
2079
- raise SystemExit(_("Error: need git >= 2.23.0"))
2080
- else:
2081
- # Set args.preserve_commit_encoding to None which we'll check for later
2082
- # to avoid passing --reencode=yes to fast-export (that option was the
2083
- # default prior to git-2.23)
2084
- args.preserve_commit_encoding = None
2085
- # If we don't have fast-export --reencode, we may also be missing
2086
- # diff-tree --combined-all-paths, which is even more important...
2087
- p = subproc.Popen('git diff-tree -h'.split(),
2088
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2089
- output = p.stdout.read()
2090
- if b'--combined-all-paths' not in output:
2091
- # We need a version of git-diff-tree with --combined-all-paths
2092
- raise SystemExit(_("Error: need git >= 2.22.0"))
2093
- # End of sanity checks on git version
2094
- if args.max_blob_size:
2095
- suffix = args.max_blob_size[-1]
2096
- if suffix not in '1234567890':
2097
- mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
2098
- if suffix not in mult:
2099
- raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than"
2100
- " argument %s")
2101
- % args.max_blob_size)
2102
- args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix]
2103
- else:
2104
- args.max_blob_size = int(args.max_blob_size)
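# Worked example of the parsing above: '5M' becomes int('5') * 1024**2
# = 5242880 bytes, '2G' becomes 2 * 1024**3, and a bare number such as
# '123456' is used as a byte count directly.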
2105
-
2106
- @staticmethod
2107
- def get_replace_text(filename):
2108
- replace_literals = []
2109
- replace_regexes = []
2110
- with open(filename, 'br') as f:
2111
- for line in f:
2112
- line = line.rstrip(b'\r\n')
2113
-
2114
- # Determine the replacement
2115
- replacement = FilteringOptions.default_replace_text
2116
- if b'==>' in line:
2117
- line, replacement = line.rsplit(b'==>', 1)
2118
-
2119
- # See if we need to match via regex
2120
- regex = None
2121
- if line.startswith(b'regex:'):
2122
- regex = line[6:]
2123
- elif line.startswith(b'glob:'):
2124
- regex = glob_to_regex(line[5:])
2125
- if regex:
2126
- replace_regexes.append((re.compile(regex), replacement))
2127
- else:
2128
- # Otherwise, find the literal we need to replace
2129
- if line.startswith(b'literal:'):
2130
- line = line[8:]
2131
- if not line:
2132
- continue
2133
- replace_literals.append((line, replacement))
2134
- return {'literals': replace_literals, 'regexes': replace_regexes}
2135
-
2136
- @staticmethod
2137
- def get_paths_from_file(filename):
2138
- new_path_changes = []
2139
- with open(filename, 'br') as f:
2140
- for line in f:
2141
- line = line.rstrip(b'\r\n')
2142
-
2143
- # Skip blank lines
2144
- if not line:
2145
- continue
2146
- # Skip comment lines
2147
- if line.startswith(b'#'):
2148
- continue
2149
-
2150
- # Determine the replacement
2151
- repl = None
2152
- if b'==>' in line:
2153
- line, repl = line.rsplit(b'==>', 1)
2154
-
2155
- # See if we need to match via regex
2156
- match_type = 'match' # a.k.a. 'literal'
2157
- if line.startswith(b'regex:'):
2158
- match_type = 'regex'
2159
- match = re.compile(line[6:])
2160
- elif line.startswith(b'glob:'):
2161
- match_type = 'glob'
2162
- match = line[5:]
2163
- if repl:
2164
- raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename)))
2165
- else:
2166
- if line.startswith(b'literal:'):
2167
- match = line[8:]
2168
- else:
2169
- match = line
2170
- if repl is not None:
2171
- if match and repl and match.endswith(b'/') != repl.endswith(b'/'):
2172
- raise SystemExit(_("Error: When rename directories, if OLDNAME "
2173
- "and NEW_NAME are both non-empty and either "
2174
- "ends with a slash then both must."))
2175
-
2176
- # Record the filter or rename
2177
- if repl is not None:
2178
- new_path_changes.append(['rename', match_type, (match, repl)])
2179
- else:
2180
- new_path_changes.append(['filter', match_type, match])
2181
- if match_type == 'glob' and not match.endswith(b'*'):
2182
- extension = b'*' if match.endswith(b'/') else b'/*'
2183
- new_path_changes.append(['filter', match_type, match+extension])
2184
- return new_path_changes
2185
-
2186
- @staticmethod
2187
- def default_options():
2188
- return FilteringOptions.parse_args([], error_on_empty = False)
2189
-
2190
- @staticmethod
2191
- def parse_args(input_args, error_on_empty = True):
2192
- parser = FilteringOptions.create_arg_parser()
2193
- if not input_args and error_on_empty:
2194
- parser.print_usage()
2195
- raise SystemExit(_("No arguments specified."))
2196
- args = parser.parse_args(input_args)
2197
- if args.help:
2198
- parser.print_help()
2199
- raise SystemExit()
2200
- if args.version:
2201
- GitUtils.print_my_version()
2202
- raise SystemExit()
2203
- FilteringOptions.sanity_check_args(args)
2204
- if args.mailmap:
2205
- args.mailmap = MailmapInfo(args.mailmap)
2206
- if args.replace_text:
2207
- args.replace_text = FilteringOptions.get_replace_text(args.replace_text)
2208
- if args.replace_message:
2209
- args.replace_message = FilteringOptions.get_replace_text(args.replace_message)
2210
- if args.strip_blobs_with_ids:
2211
- with open(args.strip_blobs_with_ids, 'br') as f:
2212
- args.strip_blobs_with_ids = set(f.read().split())
2213
- else:
2214
- args.strip_blobs_with_ids = set()
2215
- if (args.partial or args.refs) and not args.replace_refs:
2216
- args.replace_refs = 'update-no-add'
2217
- args.repack = not (args.partial or args.refs)
2218
- if args.refs or args.source or args.target:
2219
- args.partial = True
2220
- if not args.refs:
2221
- args.refs = ['--all']
2222
- return args
2223
-
2224
- class RepoAnalyze(object):
2225
-
2226
- # First, several helper functions for analyze_commit()
2227
-
2228
- @staticmethod
2229
- def equiv_class(stats, filename):
2230
- return stats['equivalence'].get(filename, (filename,))
2231
-
2232
- @staticmethod
2233
- def setup_equivalence_for_rename(stats, oldname, newname):
2234
- # if A is renamed to B and B is renamed to C, then the user thinks of
2235
- # A, B, and C as all being different names for the same 'file'. We record
2236
- # this as an equivalence class:
2237
- # stats['equivalence'][name] = (A,B,C)
2238
- # for name being each of A, B, and C.
2239
- old_tuple = stats['equivalence'].get(oldname, ())
2240
- if newname in old_tuple:
2241
- return
2242
- elif old_tuple:
2243
- new_tuple = tuple(list(old_tuple)+[newname])
2244
- else:
2245
- new_tuple = (oldname, newname)
2246
- for f in new_tuple:
2247
- stats['equivalence'][f] = new_tuple
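# Sketch of how the class evolves: if A is renamed to B, and later B to C,
# the mapping goes from {A: (A,B), B: (A,B)} after the first rename to
# {A: (A,B,C), B: (A,B,C), C: (A,B,C)} after the second.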
2248
-
2249
- @staticmethod
2250
- def setup_or_update_rename_history(stats, commit, oldname, newname):
2251
- rename_commits = stats['rename_history'].get(oldname, set())
2252
- rename_commits.add(commit)
2253
- stats['rename_history'][oldname] = rename_commits
2254
-
2255
- @staticmethod
2256
- def handle_renames(stats, commit, change_types, filenames):
2257
- for index, change_type in enumerate(change_types):
2258
- if change_type == ord(b'R'):
2259
- oldname, newname = filenames[index], filenames[-1]
2260
- RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
2261
- RepoAnalyze.setup_or_update_rename_history(stats, commit,
2262
- oldname, newname)
2263
-
2264
- @staticmethod
2265
- def handle_file(stats, graph, commit, modes, shas, filenames):
2266
- mode, sha, filename = modes[-1], shas[-1], filenames[-1]
2267
-
2268
- # Figure out kind of deletions to undo for this file, and update lists
2269
- # of all-names-by-sha and all-filenames
2270
- delmode = 'tree_deletions'
2271
- if mode != b'040000':
2272
- delmode = 'file_deletions'
2273
- stats['names'][sha].add(filename)
2274
- stats['allnames'].add(filename)
2275
-
2276
- # If the file (or equivalence class of files) was recorded as deleted,
2277
- # clearly it isn't anymore
2278
- equiv = RepoAnalyze.equiv_class(stats, filename)
2279
- for f in equiv:
2280
- stats[delmode].pop(f, None)
2281
-
2282
- # If we get a modify/add for a path that was renamed, we may need to break
2283
- # the equivalence class. However, if the modify/add was on a branch that
2284
- # doesn't have the rename in its history, we are still okay.
2285
- need_to_break_equivalence = False
2286
- if equiv[-1] != filename:
2287
- for rename_commit in stats['rename_history'][filename]:
2288
- if graph.is_ancestor(rename_commit, commit):
2289
- need_to_break_equivalence = True
2290
-
2291
- if need_to_break_equivalence:
2292
- for f in equiv:
2293
- if f in stats['equivalence']:
2294
- del stats['equivalence'][f]
2295
-
2296
- @staticmethod
2297
- def analyze_commit(stats, graph, commit, parents, date, file_changes):
2298
- graph.add_commit_and_parents(commit, parents)
2299
- for change in file_changes:
2300
- modes, shas, change_types, filenames = change
2301
- if len(parents) == 1 and change_types.startswith(b'R'):
2302
- change_types = b'R' # remove the rename score; we don't care
2303
- if modes[-1] == b'160000':
2304
- continue
2305
- elif modes[-1] == b'000000':
2306
- # Track when files/directories are deleted
2307
- for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
2308
- if any(x == b'040000' for x in modes[0:-1]):
2309
- stats['tree_deletions'][f] = date
2310
- else:
2311
- stats['file_deletions'][f] = date
2312
- elif change_types.strip(b'AMT') == b'':
2313
- RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
2314
- elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'':
2315
- RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
2316
- elif change_types.strip(b'RAMT') == b'':
2317
- RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
2318
- RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
2319
- else:
2320
- raise SystemExit(_("Unhandled change type(s): %(change_type)s "
2321
- "(in commit %(commit)s)")
2322
- % ({'change_type': change_types, 'commit': commit})
2323
- ) # pragma: no cover
2324
-
2325
- @staticmethod
2326
- def gather_data(args):
2327
- unpacked_size, packed_size = GitUtils.get_blob_sizes()
2328
- stats = {'names': collections.defaultdict(set),
2329
- 'allnames' : set(),
2330
- 'file_deletions': {},
2331
- 'tree_deletions': {},
2332
- 'equivalence': {},
2333
- 'rename_history': collections.defaultdict(set),
2334
- 'unpacked_size': unpacked_size,
2335
- 'packed_size': packed_size,
2336
- 'num_commits': 0}
2337
-
2338
- # Setup the rev-list/diff-tree process
2339
- processed_commits_msg = _("Processed %d commits")
2340
- commit_parse_progress = ProgressWriter()
2341
- num_commits = 0
2342
- cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
2343
- ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
2344
- ' --date=short -M -t -c --raw --combined-all-paths')
2345
- dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
2346
- f = dtp.stdout
2347
- line = f.readline()
2348
- if not line:
2349
- raise SystemExit(_("Nothing to analyze; repository is empty."))
2350
- cont = bool(line)
2351
- graph = AncestryGraph()
2352
- while cont:
2353
- commit = line.rstrip()
2354
- parents = f.readline().split()
2355
- date = f.readline().rstrip()
2356
-
2357
- # We expect a blank line next; if we get a non-blank line then
2358
- # this commit modified no files and we need to move on to the next.
2359
- # If there is no line, we've reached end-of-input.
2360
- line = f.readline()
2361
- if not line:
2362
- cont = False
2363
- line = line.rstrip()
2364
-
2365
- # If we haven't reached end of input, and we got a blank line meaning
2366
- # a commit that has modified files, then get the file changes associated
2367
- # with this commit.
2368
- file_changes = []
2369
- if cont and not line:
2370
- cont = False
2371
- for line in f:
2372
- if not line.startswith(b':'):
2373
- cont = True
2374
- break
2375
- n = 1+max(1, len(parents))
2376
- assert line.startswith(b':'*(n-1))
2377
- relevant = line[n-1:-1]
2378
- splits = relevant.split(None, n)
2379
- modes = splits[0:n]
2380
- splits = splits[n].split(None, n)
2381
- shas = splits[0:n]
2382
- splits = splits[n].split(b'\t')
2383
- change_types = splits[0]
2384
- filenames = [PathQuoting.dequote(x) for x in splits[1:]]
2385
- file_changes.append([modes, shas, change_types, filenames])
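# For reference, the raw lines parsed above look roughly like this (shas
# abbreviated; shape inferred from the parsing, not quoted git output):
#   :100644 100644 aaaa bbbb M\tfoo.c              (one parent, n = 2)
#   ::100644 100644 100644 aa bb cc MM\tfoo.c      (two parents, n = 3)
# i.e. n-1 leading colons, n modes, n shas, the combined change letters, then
# tab-separated path(s); with --combined-all-paths a rename lists both the
# old and new paths.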
2386
-
2387
- # If someone is trying to analyze a subset of the history, make sure
2388
- # to avoid dying on commits with parents that we haven't seen before
2389
- if args.refs:
2390
- graph.record_external_commits([p for p in parents
2391
- if p not in graph.value])
2392
-
2393
- # Analyze this commit and update progress
2394
- RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
2395
- file_changes)
2396
- num_commits += 1
2397
- commit_parse_progress.show(processed_commits_msg % num_commits)
2398
-
2399
- # Show the final commits processed message and record the number of commits
2400
- commit_parse_progress.finish()
2401
- stats['num_commits'] = num_commits
2402
-
2403
- # Close the output, ensure rev-list|diff-tree pipeline completed successfully
2404
- dtp.stdout.close()
2405
- if dtp.wait():
2406
- raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover
2407
-
2408
- return stats
2409
-
2410
- @staticmethod
2411
- def write_report(reportdir, stats):
2412
- def datestr(datetimestr):
2413
- return datetimestr if datetimestr else _('<present>').encode()
2414
-
2415
- def dirnames(path):
2416
- while True:
2417
- path = os.path.dirname(path)
2418
- yield path
2419
- if path == b'':
2420
- break
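# A quick sketch of the generator above: dirnames(b'a/b/c.txt') yields
# b'a/b', then b'a', then b'' (which the reports below print as '<toplevel>').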
2421
-
2422
- # Compute aggregate size information for paths, extensions, and dirs
2423
- total_size = {'packed': 0, 'unpacked': 0}
2424
- path_size = {'packed': collections.defaultdict(int),
2425
- 'unpacked': collections.defaultdict(int)}
2426
- ext_size = {'packed': collections.defaultdict(int),
2427
- 'unpacked': collections.defaultdict(int)}
2428
- dir_size = {'packed': collections.defaultdict(int),
2429
- 'unpacked': collections.defaultdict(int)}
2430
- for sha in stats['names']:
2431
- size = {'packed': stats['packed_size'][sha],
2432
- 'unpacked': stats['unpacked_size'][sha]}
2433
- for which in ('packed', 'unpacked'):
2434
- for name in stats['names'][sha]:
2435
- total_size[which] += size[which]
2436
- path_size[which][name] += size[which]
2437
- basename, ext = os.path.splitext(name)
2438
- ext_size[which][ext] += size[which]
2439
- for dirname in dirnames(name):
2440
- dir_size[which][dirname] += size[which]
2441
-
2442
- # Determine if and when extensions and directories were deleted
2443
- ext_deleted_data = {}
2444
- for name in stats['allnames']:
2445
- when = stats['file_deletions'].get(name, None)
2446
-
2447
- # Update the extension
2448
- basename, ext = os.path.splitext(name)
2449
- if when is None:
2450
- ext_deleted_data[ext] = None
2451
- elif ext in ext_deleted_data:
2452
- if ext_deleted_data[ext] is not None:
2453
- ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
2454
- else:
2455
- ext_deleted_data[ext] = when
2456
-
2457
- dir_deleted_data = {}
2458
- for name in dir_size['packed']:
2459
- dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
2460
-
2461
- with open(os.path.join(reportdir, b"README"), 'bw') as f:
2462
- # Give a basic overview of this file
2463
- f.write(b"== %s ==\n" % _("Overall Statistics").encode())
2464
- f.write((" %s: %d\n" % (_("Number of commits"),
2465
- stats['num_commits'])).encode())
2466
- f.write((" %s: %d\n" % (_("Number of filenames"),
2467
- len(path_size['packed']))).encode())
2468
- f.write((" %s: %d\n" % (_("Number of directories"),
2469
- len(dir_size['packed']))).encode())
2470
- f.write((" %s: %d\n" % (_("Number of file extensions"),
2471
- len(ext_size['packed']))).encode())
2472
- f.write(b"\n")
2473
- f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"),
2474
- total_size['unpacked'])).encode())
2475
- f.write((" %s: %d\n" % (_("Total packed size (bytes)"),
2476
- total_size['packed'])).encode())
2477
- f.write(b"\n")
2478
-
2479
- # Mention issues with the report
2480
- f.write(("== %s ==\n" % _("Caveats")).encode())
2481
- f.write(("=== %s ===\n" % _("Sizes")).encode())
2482
- f.write(textwrap.dedent(_("""
2483
- Packed size represents what size your repository would be if no
2484
- trees, commits, tags, or other metadata were included (though it may
2485
- fail to represent de-duplication; see below). It also represents the
2486
- current packing, which may be suboptimal if you haven't gc'ed for a
2487
- while.
2488
-
2489
- Unpacked size represents what size your repository would be if no
2490
- trees, commits, tags, or other metadata were included AND if no
2491
- files were packed; i.e., without delta-ing or compression.
2492
-
2493
- Both unpacked and packed sizes can be slightly misleading. Deleting
2494
- a blob from history will not save as much space as the unpacked size,
2495
- because it is obviously normally stored in packed form. Also,
2496
- deleting a blob from history may not save as much space as its packed
2497
- size either, because another blob could be stored as a delta against
2498
- that blob, so when you remove one blob another blob's packed size may
2499
- grow.
2500
-
2501
- Also, the sum of the packed sizes can add up to more than the
2502
- repository size; if the same contents appeared in the repository in
2503
- multiple places, git will automatically de-dupe and store only one
2504
- copy, while the way sizes are added in this analysis adds the size
2505
- for each file path that has those contents. Further, if a file is
2506
- ever reverted to a previous version's contents, the previous
2507
- version's size will be counted multiple times in this analysis, even
2508
- though git will only store it once.
2509
- """)[1:]).encode())
2510
- f.write(b"\n")
2511
- f.write(("=== %s ===\n" % _("Deletions")).encode())
2512
- f.write(textwrap.dedent(_("""
2513
- Whether a file is deleted is not a binary quality, since it can be
2514
- deleted on some branches but still exist in others. Also, it might
2515
- exist in an old tag, but have been deleted in versions newer than
2516
- that. More thorough tracking could be done, including looking at
2517
- merge commits where one side of history deleted and the other modified,
2518
- in order to give a more holistic picture of deletions. However, that
2519
- algorithm would not only be more complex to implement, it'd also be
2520
- quite difficult to present and interpret by users. Since --analyze
2521
- is just about getting a high-level rough picture of history, it instead
2522
- implements the simplistic rule that is good enough for 98% of cases:
2523
- A file is marked as deleted if the last commit in the fast-export
2524
- stream that mentions the file lists it as deleted.
2525
- This makes it dependent on topological ordering, but generally gives
2526
- the "right" answer.
2527
- """)[1:]).encode())
2528
- f.write(b"\n")
2529
- f.write(("=== %s ===\n" % _("Renames")).encode())
2530
- f.write(textwrap.dedent(_("""
2531
- Renames share the same non-binary nature that deletions do, plus
2532
- additional challenges:
2533
- * If the renamed file is renamed again, instead of just two names for
2534
- a path you can have three or more.
2535
- * Rename pairs of the form (oldname, newname) that we consider to be
2536
- different names of the "same file" might only be valid over certain
2537
- commit ranges. For example, if a new commit reintroduces a file
2538
- named oldname, then new versions of oldname aren't the "same file"
2539
- anymore. We could try to portray this to the user, but it's easier
2540
- for the user to just break the pairing and only report unbroken
2541
- rename pairings to the user.
2542
- * The ability for users to rename files differently in different
2543
- branches means that our chains of renames will not necessarily be
2544
- linear but may branch out.
2545
- """)[1:]).encode())
2546
- f.write(b"\n")
2547
-
2548
- # Equivalence classes for names, so if folks only want to keep a
2549
- # certain set of paths, they know the old names they want to include
2550
- # too.
2551
- with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f:
2552
- seen = set()
2553
- for pathname,equiv_group in sorted(stats['equivalence'].items(),
2554
- key=lambda x:(x[1], x[0])):
2555
- if equiv_group in seen:
2556
- continue
2557
- seen.add(equiv_group)
2558
- f.write(("{} ->\n ".format(decode(equiv_group[0])) +
2559
- "\n ".join(decode(x) for x in equiv_group[1:]) +
2560
- "\n").encode())
2561
-
2562
- # List directories in reverse sorted order of unpacked size
2563
- with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f:
2564
- msg = "=== %s ===\n" % _("Deleted directories by reverse size")
2565
- f.write(msg.encode())
2566
- msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
2567
- f.write(msg.encode())
2568
- for dirname, size in sorted(dir_size['packed'].items(),
2569
- key=lambda x:(x[1],x[0]), reverse=True):
2570
- if dir_deleted_data[dirname]:
2571
- f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
2572
- size,
2573
- datestr(dir_deleted_data[dirname]),
2574
- dirname or _('<toplevel>').encode()))
2575
-
2576
- with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f:
2577
- f.write(("=== %s ===\n" % _("All directories by reverse size")).encode())
2578
- msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
2579
- f.write(msg.encode())
2580
- for dirname, size in sorted(dir_size['packed'].items(),
2581
- key=lambda x:(x[1],x[0]), reverse=True):
2582
- f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
2583
- size,
2584
- datestr(dir_deleted_data[dirname]),
2585
- dirname or _("<toplevel>").encode()))
2586
-
2587
- # List extensions in reverse sorted order of unpacked size
2588
- with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f:
2589
- msg = "=== %s ===\n" % _("Deleted extensions by reverse size")
2590
- f.write(msg.encode())
2591
- msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
2592
- f.write(msg.encode())
2593
- for extname, size in sorted(ext_size['packed'].items(),
2594
- key=lambda x:(x[1],x[0]), reverse=True):
2595
- if ext_deleted_data[extname]:
2596
- f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
2597
- size,
2598
- datestr(ext_deleted_data[extname]),
2599
- extname or _('<no extension>').encode()))
2600
-
2601
- with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f:
2602
- f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode())
2603
- msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
2604
- f.write(msg.encode())
2605
- for extname, size in sorted(ext_size['packed'].items(),
2606
- key=lambda x:(x[1],x[0]), reverse=True):
2607
- f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
2608
- size,
2609
- datestr(ext_deleted_data[extname]),
2610
- extname or _('<no extension>').encode()))
2611
-
2612
- # List files in reverse sorted order of unpacked size
2613
- with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f:
2614
- msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size")
2615
- f.write(msg.encode())
2616
- msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n")
2617
- f.write(msg.encode())
2618
- for pathname, size in sorted(path_size['packed'].items(),
2619
- key=lambda x:(x[1],x[0]), reverse=True):
2620
- when = stats['file_deletions'].get(pathname, None)
2621
- if when:
2622
- f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
2623
- size,
2624
- datestr(when),
2625
- pathname))
2626
-
2627
- with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f:
2628
- msg = "=== %s ===\n" % _("All paths by reverse accumulated size")
2629
- f.write(msg.encode())
2630
- msg = _("Format: unpacked size, packed size, date deleted, path name\n")
2631
- f.write(msg.encode())
2632
- for pathname, size in sorted(path_size['packed'].items(),
2633
- key=lambda x:(x[1],x[0]), reverse=True):
2634
- when = stats['file_deletions'].get(pathname, None)
2635
- f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
2636
- size,
2637
- datestr(when),
2638
- pathname))
2639
-
2640
- # List of filenames and sizes in descending order
2641
- with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f:
2642
- f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode())
2643
- f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode())
2644
- for sha, size in sorted(stats['packed_size'].items(),
2645
- key=lambda x:(x[1],x[0]), reverse=True):
2646
- if sha not in stats['names']:
2647
- # Some objects in the repository might not be referenced, or not
2648
- # referenced by the branches/tags the user cares about; skip them.
2649
- continue
2650
- names_with_sha = stats['names'][sha]
2651
- if len(names_with_sha) == 1:
2652
- names_with_sha = names_with_sha.pop()
2653
- else:
2654
- names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']'
2655
- f.write(b" %s %10d %10d %s\n" % (sha,
2656
- stats['unpacked_size'][sha],
2657
- size,
2658
- names_with_sha))
2659
-
2660
- @staticmethod
2661
- def run(args):
2662
- if args.report_dir:
2663
- reportdir = args.report_dir
2664
- else:
2665
- git_dir = GitUtils.determine_git_dir(b'.')
2666
-
2667
- # Create the report directory as necessary
2668
- results_tmp_dir = os.path.join(git_dir, b'filter-repo')
2669
- if not os.path.isdir(results_tmp_dir):
2670
- os.mkdir(results_tmp_dir)
2671
- reportdir = os.path.join(results_tmp_dir, b"analysis")
2672
-
2673
- if os.path.isdir(reportdir):
2674
- if args.force:
2675
- sys.stdout.write(_("Warning: Removing recursively: \"%s\"") % decode(reportdir))
2676
- shutil.rmtree(reportdir)
2677
- else:
2678
- sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir))
2679
- sys.exit(1)
2680
-
2681
- os.mkdir(reportdir)
2682
-
2683
- # Gather the data we need
2684
- stats = RepoAnalyze.gather_data(args)
2685
-
2686
- # Write the reports
2687
- sys.stdout.write(_("Writing reports to %s...") % decode(reportdir))
2688
- sys.stdout.flush()
2689
- RepoAnalyze.write_report(reportdir, stats)
2690
- sys.stdout.write(_("done.\n"))
2691
-
2692
- class InputFileBackup:
2693
- def __init__(self, input_file, output_file):
2694
- self.input_file = input_file
2695
- self.output_file = output_file
2696
-
2697
- def close(self):
2698
- self.input_file.close()
2699
- self.output_file.close()
2700
-
2701
- def read(self, size):
2702
- output = self.input_file.read(size)
2703
- self.output_file.write(output)
2704
- return output
2705
-
2706
- def readline(self):
2707
- line = self.input_file.readline()
2708
- self.output_file.write(line)
2709
- return line
2710
-
2711
- class DualFileWriter:
2712
- def __init__(self, file1, file2):
2713
- self.file1 = file1
2714
- self.file2 = file2
2715
-
2716
- def write(self, *args):
2717
- self.file1.write(*args)
2718
- self.file2.write(*args)
2719
-
2720
- def flush(self):
2721
- self.file1.flush()
2722
- self.file2.flush()
2723
-
2724
- def close(self):
2725
- self.file1.close()
2726
- self.file2.close()
2727
-
2728
- class RepoFilter(object):
2729
- def __init__(self,
2730
- args,
2731
- filename_callback = None,
2732
- message_callback = None,
2733
- name_callback = None,
2734
- email_callback = None,
2735
- refname_callback = None,
2736
- blob_callback = None,
2737
- commit_callback = None,
2738
- tag_callback = None,
2739
- reset_callback = None,
2740
- done_callback = None):
2741
-
2742
- self._args = args
2743
-
2744
- # Repo we are exporting
2745
- self._repo_working_dir = None
2746
-
2747
- # Store callbacks for acting on objects printed by FastExport
2748
- self._blob_callback = blob_callback
2749
- self._commit_callback = commit_callback
2750
- self._tag_callback = tag_callback
2751
- self._reset_callback = reset_callback
2752
- self._done_callback = done_callback
2753
-
2754
- # Store callbacks for acting on slices of FastExport objects
2755
- self._filename_callback = filename_callback # filenames from commits
2756
- self._message_callback = message_callback # commit OR tag message
2757
- self._name_callback = name_callback # author, committer, tagger
2758
- self._email_callback = email_callback # author, committer, tagger
2759
- self._refname_callback = refname_callback # from commit/tag/reset
2760
- self._handle_arg_callbacks()
2761
-
2762
- # Defaults for input
2763
- self._input = None
2764
- self._fep = None # Fast Export Process
2765
- self._fe_orig = None # Path to where original fast-export output stored
2766
- self._fe_filt = None # Path to where filtered fast-export output stored
2767
- self._parser = None # FastExportParser object we are working with
2768
-
2769
- # Defaults for output
2770
- self._output = None
2771
- self._fip = None # Fast Import Process
2772
- self._import_pipes = None
2773
- self._managed_output = True
2774
-
2775
- # A tuple of (depth, list-of-ancestors). Commits and ancestors are
2776
- # identified by their id (their 'mark' in fast-export or fast-import
2777
- # speak). The depth of a commit is one more than the max depth of any
2778
- # of its ancestors.
2779
- self._graph = AncestryGraph()
2780
- # Another one, for ancestry of commits in the original repo
2781
- self._orig_graph = AncestryGraph()
2782
-
2783
- # Names of files that were tweaked in any commit; such paths could lead
2784
- # to subsequent commits being empty
2785
- self._files_tweaked = set()
2786
-
2787
- # A set of commit hash pairs (oldhash, newhash) which used to be merge
2788
- # commits but due to filtering were turned into non-merge commits.
2789
- # The commits probably have suboptimal commit messages (e.g. "Merge branch
2790
- # next into master").
2791
- self._commits_no_longer_merges = []
2792
-
2793
- # A dict of original_ids to new_ids; filtering commits means getting
2794
- # new commit hash (sha1sums), and we record the mapping both for
2795
- # diagnostic purposes and so we can rewrite commit messages. Note that
2796
- # the new_id can be None rather than a commit hash if the original
2797
- # commit became empty and was pruned or was otherwise dropped.
2798
- self._commit_renames = {}
2799
-
2800
- # A set of original_ids for which we have not yet gotten the
2801
- # new_ids; we use OrderedDict because we need to know the order of
2802
- # insertion, but the values are always ignored (and set to None).
2803
- # If there were an OrderedSet class, I'd use it instead.
2804
- self._pending_renames = collections.OrderedDict()
2805
-
2806
- # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix).
2807
- #
2808
- # It's common for commit messages to refer to commits by abbreviated
2809
- # commit hashes, as short as 7 characters. To facilitate translating
2810
- # such short hashes, we have a mapping of prefixes to full old hashes.
2811
- self._commit_short_old_hashes = collections.defaultdict(set)
2812
-
2813
- # A set of commit hash references appearing in commit messages which
2814
- # mapped to a valid commit that was removed entirely in the filtering
2815
- # process. The commit message will continue to reference the
2816
- # now-missing commit hash, since there was nothing to map it to.
2817
- self._commits_referenced_but_removed = set()
2818
-
2819
- # Progress handling (number of commits parsed, etc.)
2820
- self._progress_writer = ProgressWriter()
2821
- self._num_commits = 0
2822
-
2823
- # Size of blobs in the repo
2824
- self._unpacked_size = {}
2825
-
2826
- # Other vars
2827
- self._sanity_checks_handled = False
2828
- self._finalize_handled = False
2829
- self._orig_refs = None
2830
- self._newnames = {}
2831
-
2832
- # Cache a few message translations for performance reasons
2833
- self._parsed_message = _("Parsed %d commits")
2834
-
2835
- # Compile some regexes and cache those
2836
- self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)')
2837
-
2838
- def _handle_arg_callbacks(self):
2839
- def make_callback(argname, str):
2840
- exec('def callback({}, _do_not_use_this_var = None):\n'.format(argname)+
2841
- ' '+'\n '.join(str.splitlines()), globals())
2842
- return callback # the exec() above bound 'callback' in globals()
2843
- def handle(type):
2844
- callback_field = '_{}_callback'.format(type)
2845
- code_string = getattr(self._args, type+'_callback')
2846
- if code_string:
2847
- if os.path.exists(code_string):
2848
- with open(code_string, 'r', encoding='utf-8') as f:
2849
- code_string = f.read()
2850
- if getattr(self, callback_field):
2851
- raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter "
2852
- "AND pass --%s-callback"
2853
- % (type, type)))
2854
- if 'return ' not in code_string and \
2855
- type not in ('blob', 'commit', 'tag', 'reset'):
2856
- raise SystemExit(_("Error: --%s-callback should have a return statement")
2857
- % type)
2858
- setattr(self, callback_field, make_callback(type, code_string))
2859
- handle('filename')
2860
- handle('message')
2861
- handle('name')
2862
- handle('email')
2863
- handle('refname')
2864
- handle('blob')
2865
- handle('commit')
2866
- handle('tag')
2867
- handle('reset')
2868
-
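
For readers skimming the exec() trick in _handle_arg_callbacks: a callback
body passed as a string on the command line gets wrapped into a one-argument
function named after the callback type. A minimal sketch of the same
wrapping (the example body and the b'password' literal are invented, not
from the diff):

    body = "return message.replace(b'password', b'xxxx')"
    # Same construction as make_callback(), for a --message-callback:
    exec('def callback(message, _do_not_use_this_var = None):\n  ' +
         '\n  '.join(body.splitlines()), globals())
    assert callback(b'my password') == b'my xxxx'
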
2869
- def _run_sanity_checks(self):
2870
- self._sanity_checks_handled = True
2871
- if not self._managed_output:
2872
- if not self._args.replace_refs:
2873
- # If not _managed_output we don't want to make extra changes to the
2874
- # repo, so set default to no-op 'update-no-add'
2875
- self._args.replace_refs = 'update-no-add'
2876
- return
2877
-
2878
- if self._args.debug:
2879
- print("[DEBUG] Passed arguments:\n{}".format(self._args))
2880
-
2881
- # Determine basic repository information
2882
- target_working_dir = self._args.target or b'.'
2883
- self._orig_refs = GitUtils.get_refs(target_working_dir)
2884
- is_bare = GitUtils.is_repository_bare(target_working_dir)
2885
-
2886
- # Determine if this is second or later run of filter-repo
2887
- tmp_dir = self.results_tmp_dir(create_if_missing=False)
2888
- already_ran = os.path.isfile(os.path.join(tmp_dir, b'already_ran'))
2889
-
2890
- # Default for --replace-refs
2891
- if not self._args.replace_refs:
2892
- self._args.replace_refs = ('update-or-add' if already_ran
2893
- else 'update-and-add')
2894
-
2895
- # Do sanity checks from the correct directory
2896
- if not self._args.force and not already_ran:
2897
- cwd = os.getcwd()
2898
- os.chdir(target_working_dir)
2899
- RepoFilter.sanity_check(self._orig_refs, is_bare)
2900
- os.chdir(cwd)
2901
-
2902
- @staticmethod
2903
- def sanity_check(refs, is_bare):
2904
- def abort(reason):
2905
- try:
2906
- cmd = 'git config remote.origin.url'
2907
- output = subproc.check_output(cmd.split()).strip()
2908
- except subprocess.CalledProcessError as e:
2909
- output = None
2910
- msg = ""
2911
- if output and os.path.isdir(output):
2912
- msg = _("Note: when cloning local repositories, you need to pass\n"
2913
- " --no-local to git clone to avoid this issue.\n")
2914
- raise SystemExit(
2915
- _("Aborting: Refusing to destructively overwrite repo history since\n"
2916
- "this does not look like a fresh clone.\n"
2917
- " (%s)\n%s"
2918
- "Please operate on a fresh clone instead. If you want to proceed\n"
2919
- "anyway, use --force.") % (reason, msg))
2920
-
2921
- # Make sure repo is fully packed, just like a fresh clone would be.
2922
- # Note that transfer.unpackLimit defaults to 100, meaning that a
2923
- # repository with no packs and less than 100 objects should be considered
2924
- # fully packed.
2925
- output = subproc.check_output('git count-objects -v'.split())
2926
- stats = dict(x.split(b': ') for x in output.splitlines())
2927
- num_packs = int(stats[b'packs'])
2928
- num_loose_objects = int(stats[b'count'])
2929
- if num_packs > 1 or \
2930
- (num_packs == 1 and num_loose_objects > 0) or \
2931
- num_loose_objects >= 100:
2932
- abort(_("expected freshly packed repo"))
2933
-
2934
- # Make sure there is precisely one remote, named "origin"...or that this
2935
- # is a new bare repo with no packs and no remotes
2936
- output = subproc.check_output('git remote'.split()).strip()
2937
- if not (output == b"origin" or (num_packs == 0 and not output)):
2938
- abort(_("expected one remote, origin"))
2939
-
2940
- # Guard against unusual setups that override GIT_DIR, so we don't end up
2941
- # rewriting a repository other than the intended one
2942
- git_dir = GitUtils.determine_git_dir(b'.')
2943
- if is_bare and git_dir != b'.':
2944
- abort(_("GIT_DIR must be ."))
2945
- elif not is_bare and git_dir != b'.git':
2946
- abort(_("GIT_DIR must be .git"))
2947
-
2948
- # Make sure that all reflogs have precisely one entry
2949
- reflog_dir=os.path.join(git_dir, b'logs')
2950
- for root, dirs, files in os.walk(reflog_dir):
2951
- for filename in files:
2952
- pathname = os.path.join(root, filename)
2953
- with open(pathname, 'br') as f:
2954
- if len(f.read().splitlines()) > 1:
2955
- shortpath = pathname[len(reflog_dir)+1:]
2956
- abort(_("expected at most one entry in the reflog for %s") %
2957
- decode(shortpath))
2958
-
2959
- # Make sure there are no stashed changes
2960
- if b'refs/stash' in refs:
2961
- abort(_("has stashed changes"))
2962
-
2963
- # Do extra checks in non-bare repos
2964
- if not is_bare:
2965
- # Avoid uncommitted, unstaged, or untracked changes
2966
- if subproc.call('git diff --staged --quiet'.split()):
2967
- abort(_("you have uncommitted changes"))
2968
- if subproc.call('git diff --quiet'.split()):
2969
- abort(_("you have unstaged changes"))
2970
- if len(subproc.check_output('git ls-files -o'.split())) > 0:
2971
- abort(_("you have untracked changes"))
2972
-
2973
- # Avoid unpushed changes
2974
- for refname, rev in refs.items():
2975
- if not refname.startswith(b'refs/heads/'):
2976
- continue
2977
- origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/')
2978
- if origin_ref not in refs:
2979
- abort(_('%s exists, but %s not found') % (decode(refname),
2980
- decode(origin_ref)))
2981
- if rev != refs[origin_ref]:
2982
- abort(_('%s does not match %s') % (decode(refname),
2983
- decode(origin_ref)))
2984
-
2985
- # Make sure there is only one worktree
2986
- output = subproc.check_output('git worktree list'.split())
2987
- if len(output.splitlines()) > 1:
2988
- abort(_('you have multiple worktrees'))
2989
-
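
The "freshly packed" heuristic in sanity_check can be tried in isolation
against canned `git count-objects -v` output; the sample numbers below are
made up:

    sample = (b"count: 0\nsize: 0\nin-pack: 132\npacks: 1\n"
              b"size-pack: 40\nprune-packable: 0\ngarbage: 0\nsize-garbage: 0")
    stats = dict(x.split(b': ') for x in sample.splitlines())
    num_packs = int(stats[b'packs'])
    num_loose_objects = int(stats[b'count'])
    looks_fresh = not (num_packs > 1 or
                       (num_packs == 1 and num_loose_objects > 0) or
                       num_loose_objects >= 100)
    assert looks_fresh
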
2990
- @staticmethod
2991
- def cleanup(repo, repack, reset, run_quietly=False, show_debuginfo=False):
2992
- ''' Clean up the repo: if repack, expire reflogs and run gc --prune=now;
2993
- if reset, run a reset --hard. Output is curbed when run_quietly is
2994
- True, or expanded with extra detail when show_debuginfo is True (the
2995
- two options are mutually exclusive). '''
2996
- assert not (run_quietly and show_debuginfo)
2997
-
2998
- if (repack and not run_quietly and not show_debuginfo):
2999
- print(_("Repacking your repo and cleaning out old unneeded objects"))
3000
- quiet_flags = '--quiet' if run_quietly else ''
3001
- cleanup_cmds = []
3002
- if repack:
3003
- cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
3004
- 'git gc {} --prune=now'.format(quiet_flags).split()]
3005
- if reset:
3006
- cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split())
3007
- location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else ''
3008
- for cmd in cleanup_cmds:
3009
- if show_debuginfo:
3010
- print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
3011
- subproc.call(cmd, cwd=repo)
3012
-
3013
- def _get_rename(self, old_hash):
3014
- # If we already know the rename, just return it
3015
- new_hash = self._commit_renames.get(old_hash, None)
3016
- if new_hash:
3017
- return new_hash
3018
-
3019
- # If it's not in the remaining pending renames, we don't know it
3020
- if old_hash is not None and old_hash not in self._pending_renames:
3021
- return None
3022
-
3023
- # Read through the pending renames until we find it or we've read them all,
3024
- # and return whatever we might find
3025
- self._flush_renames(old_hash)
3026
- return self._commit_renames.get(old_hash, None)
3027
-
3028
- def _flush_renames(self, old_hash=None, limit=0):
3029
- # Parse through self._pending_renames until we have read enough. We have
3030
- # read enough if any of the following hold:
3031
- # * self._pending_renames is empty
3032
- # * old_hash != None and we found a rename for old_hash
3033
- # * limit > 0 and len(self._pending_renames) started out below 2*limit
3034
- # * limit > 0 and len(self._pending_renames) has dropped below limit
3035
- if limit and len(self._pending_renames) < 2 * limit:
3036
- return
3037
- fi_input, fi_output = self._import_pipes
3038
- while self._pending_renames:
3039
- orig_id, ignore = self._pending_renames.popitem(last=False)
3040
- new_id = fi_output.readline().rstrip()
3041
- self._commit_renames[orig_id] = new_id
3042
- if old_hash == orig_id:
3043
- return
3044
- if limit and len(self._pending_renames) < limit:
3045
- return
3046
-
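
The bookkeeping in _flush_renames is a FIFO drain: pending marks are popped
in insertion order as fast-import's answers arrive on its output pipe. A
sketch with a canned iterator standing in for fi_output:

    import collections

    pending = collections.OrderedDict.fromkeys([b'c1', b'c2', b'c3'])
    answers = iter([b'new1\n', b'new2\n', b'new3\n'])  # fi_output stand-in
    renames = {}
    while pending:
        orig_id, _ignored = pending.popitem(last=False)  # oldest first
        renames[orig_id] = next(answers).rstrip()
    assert renames == {b'c1': b'new1', b'c2': b'new2', b'c3': b'new3'}
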
3047
- def _translate_commit_hash(self, matchobj_or_oldhash):
3048
- old_hash = matchobj_or_oldhash
3049
- if not isinstance(matchobj_or_oldhash, bytes):
3050
- old_hash = matchobj_or_oldhash.group(1)
3051
- orig_len = len(old_hash)
3052
- new_hash = self._get_rename(old_hash)
3053
- if new_hash is None:
3054
- if old_hash[0:7] not in self._commit_short_old_hashes:
3055
- self._commits_referenced_but_removed.add(old_hash)
3056
- return old_hash
3057
- possibilities = self._commit_short_old_hashes[old_hash[0:7]]
3058
- matches = [x for x in possibilities
3059
- if x[0:orig_len] == old_hash]
3060
- if len(matches) != 1:
3061
- self._commits_referenced_but_removed.add(old_hash)
3062
- return old_hash
3063
- old_hash = matches[0]
3064
- new_hash = self._get_rename(old_hash)
3065
-
3066
- assert new_hash is not None
3067
- return new_hash[0:orig_len]
3068
-
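
Abbreviated-hash translation above only fires when the 7-character prefix
map resolves a reference unambiguously. A self-contained sketch (the hash
is invented):

    import collections

    full = b'1234567890abcdef1234567890abcdef12345678'
    short_old_hashes = collections.defaultdict(set)
    short_old_hashes[full[0:7]].add(full)

    ref = full[0:12]  # an abbreviated hash seen in a commit message
    matches = [x for x in short_old_hashes[ref[0:7]] if x[0:len(ref)] == ref]
    assert matches == [full]  # unique match, safe to translate
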
3069
- def _trim_extra_parents(self, orig_parents, parents):
3070
- '''Due to pruning of empty commits, some parents could be non-existent
3071
- (None) or otherwise redundant. Remove the non-existent parents, and
3072
- remove redundant parents so long as that doesn't transform a merge
3073
- commit into a non-merge commit.
3074
-
3075
- Returns a tuple:
3076
- (parents, new_first_parent_if_would_become_non_merge)'''
3077
-
3078
- always_prune = (self._args.prune_degenerate == 'always')
3079
-
3080
- # Pruning of empty commits means multiple things:
3081
- # * An original parent of this commit may have been pruned causing the
3082
- # need to rewrite the reported parent to the nearest ancestor. We
3083
- # want to know when we're dealing with such a parent.
3084
- # * Further, there may be no "nearest ancestor" if the entire history
3085
- # of that parent was also pruned. (Detectable by the parent being
3086
- # 'None')
3087
- # Remove all parents rewritten to None, and keep track of which parents
3088
- # were rewritten to an ancestor.
3089
- tmp = zip(parents,
3090
- orig_parents,
3091
- [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents])
3092
- tmp2 = [x for x in tmp if x[0] is not None]
3093
- if not tmp2:
3094
- # All ancestors have been pruned; we have no parents.
3095
- return [], None
3096
- parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)]
3097
-
3098
- # We can't have redundant parents if we don't have at least 2 parents
3099
- if len(parents) < 2:
3100
- return parents, None
3101
-
3102
- # Don't remove redundant parents if user doesn't want us to
3103
- if self._args.prune_degenerate == 'never':
3104
- return parents, None
3105
-
3106
- # Remove duplicate parents (if both sides of history have lots of commits
3107
- # which become empty due to pruning, the most recent ancestor on both
3108
- # sides may be the same commit), except only remove parents that have
3109
- # been rewritten due to previous empty pruning.
3110
- seen = set()
3111
- seen_add = seen.add
3112
- # Deleting duplicate rewritten parents means keeping parents if either
3113
- # they have not been seen or they are ones that have not been rewritten.
3114
- parents_copy = parents # snapshot: 'parents' is rebound to a new list below
3115
- uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents)
3116
- if not (p in seen or seen_add(p)) or not is_rewritten[i]]
3117
- parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)]
3118
- if len(parents) < 2:
3119
- return parents_copy, parents[0]
3120
-
3121
- # Flatten unnecessary merges. (If one side of history is entirely
3122
- # empty commits that were pruned, we may end up attempting to
3123
- # merge a commit with its ancestor. Remove parents that are an
3124
- # ancestor of another parent.)
3125
- num_parents = len(parents)
3126
- to_remove = []
3127
- for cur in range(num_parents):
3128
- if not is_rewritten[cur]:
3129
- continue
3130
- for other in range(num_parents):
3131
- if cur == other:
3132
- continue
3133
- if not self._graph.is_ancestor(parents[cur], parents[other]):
3134
- continue
3135
- # parents[cur] is an ancestor of parents[other], so parents[cur]
3136
- # seems redundant. However, if it was intentionally redundant
3137
- # (e.g. a no-ff merge) in the original, then we want to keep it.
3138
- if not always_prune and \
3139
- self._orig_graph.is_ancestor(orig_parents[cur],
3140
- orig_parents[other]):
3141
- continue
3142
- # Some folks want their history to have all first parents be merge
3143
- # commits (except for any root commits), and always do a merge --no-ff.
3144
- # For such folks, don't remove the first parent even if it's an
3145
- # ancestor of other commits.
3146
- if self._args.no_ff and cur == 0:
3147
- continue
3148
- # Okay so the cur-th parent is an ancestor of the other-th parent,
3149
- # and it wasn't that way in the original repository; mark the
3150
- # cur-th parent as removable.
3151
- to_remove.append(cur)
3152
- break # cur is marked for removal; skip remaining others and advance cur
3153
- for x in reversed(to_remove):
3154
- parents.pop(x)
3155
- if len(parents) < 2:
3156
- return parents_copy, parents[0]
3157
-
3158
- return parents, None
3159
-
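
The seen/seen_add pattern in _trim_extra_parents is the standard
order-preserving dedup idiom: set.add() returns None, so the `or` both
records an element and keeps only its first occurrence. In isolation:

    parents = [3, 5, 3, 7, 5]
    seen = set()
    seen_add = seen.add  # bound-method lookup hoisted out of the loop
    uniq = [p for p in parents if not (p in seen or seen_add(p))]
    assert uniq == [3, 5, 7]
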
3160
- def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents):
3161
- parents = commit.parents
3162
-
3163
- if self._args.prune_empty == 'never':
3164
- return False
3165
- always_prune = (self._args.prune_empty == 'always')
3166
-
3167
- # For merge commits, unless there are prunable (redundant) parents, we
3168
- # do not want to prune
3169
- if len(parents) >= 2 and not new_1st_parent:
3170
- return False
3171
-
3172
- if len(parents) < 2:
3173
- # Special logic for commits that started empty...
3174
- if not had_file_changes and not always_prune:
3175
- had_parents_pruned = (len(parents) < len(orig_parents) or
3176
- (len(orig_parents) == 1 and
3177
- orig_parents[0] in _SKIPPED_COMMITS))
3178
- # If the commit remains empty and had parents which were pruned,
3179
- # then prune this commit; otherwise, retain it
3180
- return (not commit.file_changes and had_parents_pruned)
3181
-
3182
- # We can only get here if the commit didn't start empty, so if it's
3183
- # empty now, it obviously became empty
3184
- if not commit.file_changes:
3185
- return True
3186
-
3187
- # If there are no parents of this commit and we didn't match the case
3188
- # above, then this commit cannot be pruned. Since we have no parent(s)
3189
- # to compare to, abort now to prevent future checks from failing.
3190
- if not parents:
3191
- return False
3192
-
3193
- # Similarly, we cannot handle the hard cases if we don't have a pipe
3194
- # to communicate with fast-import
3195
- if not self._import_pipes:
3196
- return False
3197
-
3198
- # If there have not been renames/remappings of IDs (due to insertion of
3199
- # new blobs), then we can sometimes know things aren't prunable with a
3200
- # simple check
3201
- if not _IDS.has_renames():
3202
- # non-merge commits can only be empty if blob/file-change editing caused
3203
- # all file changes in the commit to have the same file contents as
3204
- # the parent.
3205
- changed_files = set(change.filename for change in commit.file_changes)
3206
- if len(orig_parents) < 2 and changed_files - self._files_tweaked:
3207
- return False
3208
-
3209
- # Finally, the hard case: due to either blob rewriting, or due to pruning
3210
- # of empty commits wiping out the first parent history back to the merge
3211
- # base, the list of file_changes we have may not actually differ from our
3212
- # (new) first parent's version of the files, i.e. this would actually be
3213
- # an empty commit. Check by comparing the contents of this commit to its
3214
- # (remaining) parent.
3215
- #
3216
- # NOTE on why this works, for the case of original first parent history
3217
- # having been pruned away due to being empty:
3218
- # The first parent history having been pruned away due to being
3219
- # empty implies the original first parent would have a tree (after
3220
- # filtering) that matched the merge base's tree. Since
3221
- # file_changes has the changes needed to go from what would have
3222
- # been the first parent to our new commit, and what would have been
3223
- # our first parent has a tree that matches the merge base, then if
3224
- # the new first parent has a tree matching the versions of files in
3225
- # file_changes, then this new commit is empty and thus prunable.
3226
- fi_input, fi_output = self._import_pipes
3227
- self._flush_renames() # Avoid fi_output having other stuff present
3228
- # Optimization note: we could have two loops over file_changes, the
3229
- # first doing all the self._output.write() calls, and the second doing
3230
- # the rest. But I'm worried about fast-import blocking on fi_output
3231
- # buffers filling up so I instead read from it as I go.
3232
- for change in commit.file_changes:
3233
- parent = new_1st_parent or commit.parents[0] # exists due to above checks
3234
- quoted_filename = PathQuoting.enquote(change.filename)
3235
- if isinstance(parent, int):
3236
- self._output.write(b"ls :%d %s\n" % (parent, quoted_filename))
3237
- else:
3238
- self._output.write(b"ls %s %s\n" % (parent, quoted_filename))
3239
- self._output.flush()
3240
- parent_version = fi_output.readline().split()
3241
- if change.type == b'D':
3242
- if parent_version != [b'missing', quoted_filename]:
3243
- return False
3244
- else:
3245
- blob_sha = change.blob_id
3246
- if isinstance(change.blob_id, int):
3247
- self._output.write(b"get-mark :%d\n" % change.blob_id)
3248
- self._output.flush()
3249
- blob_sha = fi_output.readline().rstrip()
3250
- if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]:
3251
- return False
3252
-
3253
- return True
3254
-
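
_prunable leans on two real git fast-import commands: `ls` (look up a path
in a commit) and `get-mark` (resolve a mark to a hash). The queries it
writes look like the following; the path, mark, and reply shown here are
illustrative:

    mark, path = 42, b'src/secret.txt'
    query = b"ls :%d %s\n" % (mark, path)  # written to fast-import's stdin

    # fast-import replies either "<mode> blob <sha> <path>" or
    # "missing <path>"; a deletion is only redundant if the parent
    # already lacked the path:
    reply = b"missing src/secret.txt\n".split()
    deletion_is_redundant = (reply == [b'missing', path])
    assert deletion_is_redundant
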
3255
- def _record_remapping(self, commit, orig_parents):
3256
- new_id = None
3257
- # Record the mapping of old commit hash to new one
3258
- if commit.original_id and self._import_pipes:
3259
- fi_input, fi_output = self._import_pipes
3260
- self._output.write(b"get-mark :%d\n" % commit.id)
3261
- self._output.flush()
3262
- orig_id = commit.original_id
3263
- self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
3264
- # Note that we have queued up an id for later reading; flush a
3265
- # few of the older ones if we have too many queued up
3266
- self._pending_renames[orig_id] = None
3267
- self._flush_renames(None, limit=40)
3268
- # Also, record if this was a merge commit that turned into a non-merge
3269
- # commit.
3270
- if len(orig_parents) >= 2 and len(commit.parents) < 2:
3271
- self._commits_no_longer_merges.append((commit.original_id, new_id))
3272
-
3273
- def callback_metadata(self, extra_items = dict()):
3274
- return {'commit_rename_func': self._translate_commit_hash,
3275
- 'ancestry_graph': self._graph,
3276
- 'original_ancestry_graph': self._orig_graph,
3277
- **extra_items}
3278
-
3279
- def _tweak_blob(self, blob):
3280
- if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size:
3281
- blob.skip()
3282
-
3283
- if blob.original_id in self._args.strip_blobs_with_ids:
3284
- blob.skip()
3285
-
3286
- if ( self._args.replace_text
3287
- # skip blobs that look binary, i.e. contain a NUL byte in the first 8kB
3288
- and b"\0" not in blob.data[0:8192]
3289
- ):
3290
- for literal, replacement in self._args.replace_text['literals']:
3291
- blob.data = blob.data.replace(literal, replacement)
3292
- for regex, replacement in self._args.replace_text['regexes']:
3293
- blob.data = regex.sub(replacement, blob.data)
3294
-
3295
- if self._blob_callback:
3296
- self._blob_callback(blob, self.callback_metadata())
3297
-
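
The blob-rewriting step above in miniature: blobs that look binary (a NUL
byte in the first 8kB) are left untouched, and the rest get literal and
regex substitutions. The sample data and patterns here are invented:

    import re

    data = b"password = hunter2\n"
    if b"\0" not in data[0:8192]:  # skip blobs that look binary
        data = data.replace(b"hunter2", b"***REMOVED***")  # literal rule
        data = re.sub(br"password", b"secret", data)       # regex rule
    assert data == b"secret = ***REMOVED***\n"
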
3298
- def _filter_files(self, commit):
3299
- def filename_matches(path_expression, pathname):
3300
- ''' Returns whether path_expression matches pathname or a leading
3301
- directory thereof, allowing path_expression to not have a trailing
3302
- slash even if it is meant to match a leading directory. '''
3303
- if path_expression == b'':
3304
- return True
3305
- n = len(path_expression)
3306
- if (pathname.startswith(path_expression) and
3307
- (path_expression[n-1:n] == b'/' or
3308
- len(pathname) == n or
3309
- pathname[n:n+1] == b'/')):
3310
- return True
3311
- return False
3312
-
3313
- def newname(path_changes, pathname, use_base_name, filtering_is_inclusive):
3314
- ''' Applies filtering and rename changes from path_changes to pathname,
3315
- returning any of None (file isn't wanted), original filename (file
3316
- is wanted with original name), or new filename. '''
3317
- wanted = False
3318
- full_pathname = pathname
3319
- if use_base_name:
3320
- pathname = os.path.basename(pathname)
3321
- for (mod_type, match_type, path_exp) in path_changes:
3322
- if mod_type == 'filter' and not wanted:
3323
- assert match_type in ('match', 'glob', 'regex')
3324
- if match_type == 'match' and filename_matches(path_exp, pathname):
3325
- wanted = True
3326
- if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp):
3327
- wanted = True
3328
- if match_type == 'regex' and path_exp.search(pathname):
3329
- wanted = True
3330
- elif mod_type == 'rename':
3331
- match, repl = path_exp
3332
- assert match_type in ('match','regex') # glob was translated to regex
3333
- if match_type == 'match' and filename_matches(match, full_pathname):
3334
- full_pathname = full_pathname.replace(match, repl, 1)
3335
- if match_type == 'regex':
3336
- full_pathname = match.sub(repl, full_pathname)
3337
- return full_pathname if (wanted == filtering_is_inclusive) else None
3338
-
3339
- args = self._args
3340
- new_file_changes = {} # Assumes no renames or copies, otherwise collisions
3341
- for change in commit.file_changes:
3342
- # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and
3343
- # parse that output, we'll need to modify this block; `--full-tree`
3344
- # issues a deleteall directive which has no filename, and thus this
3345
- # block would normally strip it. Of course, FileChange() and
3346
- # _parse_optional_filechange() would need updates too.
3347
- if change.type == b'DELETEALL':
3348
- new_file_changes[b''] = change
3349
- continue
3350
- if change.filename in self._newnames:
3351
- change.filename = self._newnames[change.filename]
3352
- else:
3353
- original_filename = change.filename
3354
- change.filename = newname(args.path_changes, change.filename,
3355
- args.use_base_name, args.inclusive)
3356
- if self._filename_callback:
3357
- change.filename = self._filename_callback(change.filename)
3358
- self._newnames[original_filename] = change.filename
3359
- if not change.filename:
3360
- continue # Filtering criteria excluded this file; move on to next one
3361
- if change.filename in new_file_changes:
3362
- # Getting here means that path renaming is in effect, and caused one
3363
- # path to collide with another. That's usually bad, but can be okay
3364
- # under two circumstances:
3365
- # 1) Sometimes people have a file named OLDFILE in old revisions of
3366
- # history, and they rename to NEWFILE, and would like to rewrite
3367
- # history so that all revisions refer to it as NEWFILE. As such,
3368
- # we can allow a collision when (at least) one of the two paths
3369
- # is a deletion. Note that if OLDFILE and NEWFILE are unrelated
3370
- # this also allows the rewrite to continue, which makes sense
3371
- # since OLDFILE is no longer in the way.
3372
- # 2) If OLDFILE and NEWFILE are exactly equal, then writing them
3373
- # both to the same location poses no problem; we only need one
3374
- # file. (This could come up if someone copied a file in some
3375
- # commit, then later either deleted the file or kept it exactly
3376
- # in sync with the original whenever it changed, and then decides
3377
- # they want to rewrite history to only have one of the two files)
3378
- colliding_change = new_file_changes[change.filename]
3379
- if change.type == b'D':
3380
- # We can just throw this one away and keep the other
3381
- continue
3382
- elif change.type == b'M' and (
3383
- change.mode == colliding_change.mode and
3384
- change.blob_id == colliding_change.blob_id):
3385
- # The two are identical, so we can throw this one away and keep other
3386
- continue
3387
- elif new_file_changes[change.filename].type != b'D':
3388
- raise SystemExit(_("File renaming caused colliding pathnames!\n") +
3389
- _(" Commit: {}\n").format(commit.original_id) +
3390
- _(" Filename: {}").format(change.filename))
3391
- # Strip files that are too large
3392
- if self._args.max_blob_size and \
3393
- self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size:
3394
- continue
3395
- if self._args.strip_blobs_with_ids and \
3396
- change.blob_id in self._args.strip_blobs_with_ids:
3397
- continue
3398
- # Otherwise, record the change
3399
- new_file_changes[change.filename] = change
3400
- commit.file_changes = [v for k,v in sorted(new_file_changes.items())]
3401
-
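
filename_matches above treats a path expression as matching the path itself
or any leading directory of it, trailing slash optional. Extracted as a
standalone function with a few illustrative paths:

    def filename_matches(path_expression, pathname):
        if path_expression == b'':
            return True
        n = len(path_expression)
        return (pathname.startswith(path_expression) and
                (path_expression[n-1:n] == b'/' or
                 len(pathname) == n or
                 pathname[n:n+1] == b'/'))

    assert filename_matches(b'src', b'src/main.py')
    assert filename_matches(b'src/', b'src/main.py')
    assert not filename_matches(b'src', b'srcs/main.py')
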
3402
- def _tweak_commit(self, commit, aux_info):
3403
- # Change the commit message according to callback
3404
- if not self._args.preserve_commit_hashes:
3405
- commit.message = self._hash_re.sub(self._translate_commit_hash,
3406
- commit.message)
3407
- if self._args.replace_message:
3408
- for literal, replacement in self._args.replace_message['literals']:
3409
- commit.message = commit.message.replace(literal, replacement)
3410
- for regex, replacement in self._args.replace_message['regexes']:
3411
- commit.message = regex.sub(replacement, commit.message)
3412
- if self._message_callback:
3413
- commit.message = self._message_callback(commit.message)
3414
-
3415
- # Change the author & committer according to mailmap rules
3416
- args = self._args
3417
- if args.mailmap:
3418
- commit.author_name, commit.author_email = \
3419
- args.mailmap.translate(commit.author_name, commit.author_email)
3420
- commit.committer_name, commit.committer_email = \
3421
- args.mailmap.translate(commit.committer_name, commit.committer_email)
3422
- # Change author & committer according to callbacks
3423
- if self._name_callback:
3424
- commit.author_name = self._name_callback(commit.author_name)
3425
- commit.committer_name = self._name_callback(commit.committer_name)
3426
- if self._email_callback:
3427
- commit.author_email = self._email_callback(commit.author_email)
3428
- commit.committer_email = self._email_callback(commit.committer_email)
3429
-
3430
- # Sometimes the 'branch' given is a tag; if so, rename it as requested so
3431
- # we don't get any old tagnames
3432
- if self._args.tag_rename:
3433
- commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch)
3434
- if self._refname_callback:
3435
- commit.branch = self._refname_callback(commit.branch)
3436
-
3437
- # Filter or rename the list of file changes
3438
- orig_file_changes = set(commit.file_changes)
3439
- self._filter_files(commit)
3440
-
3441
- # Record ancestry graph
3442
- parents, orig_parents = commit.parents, aux_info['orig_parents']
3443
- if self._args.state_branch:
3444
- external_parents = parents
3445
- else:
3446
- external_parents = [p for p in parents if not isinstance(p, int)]
3447
- self._graph.record_external_commits(external_parents)
3448
- self._orig_graph.record_external_commits(external_parents)
3449
- self._graph.add_commit_and_parents(commit.id, parents)
3450
- self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents)
3451
-
3452
- # Prune parents (due to pruning of empty commits) if relevant
3453
- old_1st_parent = parents[0] if parents else None
3454
- parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents)
3455
- commit.parents = parents
3456
-
3457
- # If parents were pruned, then we need our file changes to be relative
3458
- # to the new first parent
3459
- if parents and old_1st_parent != parents[0]:
3460
- commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
3461
- ID_TO_HASH[parents[0]],
3462
- commit.original_id)
3463
- orig_file_changes = set(commit.file_changes)
3464
- self._filter_files(commit)
3465
-
3466
- # Find out which files were modified by the callbacks. Such paths could
3467
- # lead to subsequent commits being empty (e.g. if removing a line containing
3468
- # a password from every version of a file that had the password, and some
3469
- # later commit did nothing more than remove that line)
3470
- final_file_changes = set(commit.file_changes)
3471
- if self._args.replace_text or self._blob_callback:
3472
- differences = orig_file_changes.union(final_file_changes)
3473
- else:
3474
- differences = orig_file_changes.symmetric_difference(final_file_changes)
3475
- self._files_tweaked.update(x.filename for x in differences)
3476
-
3477
- # Call the user-defined callback, if any
3478
- if self._commit_callback:
3479
- self._commit_callback(commit, self.callback_metadata(aux_info))
3480
-
3481
- # Now print the resulting commit, or if prunable skip it
3482
- if not commit.dumped:
3483
- if not self._prunable(commit, new_1st_parent,
3484
- aux_info['had_file_changes'], orig_parents):
3485
- self._insert_into_stream(commit)
3486
- self._record_remapping(commit, orig_parents)
3487
- else:
3488
- rewrite_to = new_1st_parent or commit.first_parent()
3489
- commit.skip(new_id = rewrite_to)
3490
- if self._args.state_branch:
3491
- alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
3492
- self._insert_into_stream(alias)
3493
- reset = Reset(commit.branch, rewrite_to or deleted_hash)
3494
- self._insert_into_stream(reset)
3495
- self._commit_renames[commit.original_id] = None
3496
-
3497
- # Show progress
3498
- self._num_commits += 1
3499
- if not self._args.quiet:
3500
- self._progress_writer.show(self._parsed_message % self._num_commits)
3501
-
3502
- @staticmethod
3503
- def _do_tag_rename(rename_pair, tagname):
3504
- old, new = rename_pair.split(b':', 1)
3505
- old, new = b'refs/tags/'+old, b'refs/tags/'+new
3506
- if tagname.startswith(old):
3507
- return tagname.replace(old, new, 1)
3508
- return tagname
3509
-
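
_do_tag_rename in isolation: the OLD:NEW pair from --tag-rename is applied
as a one-shot prefix rewrite inside refs/tags/, leaving non-matching tags
alone (tag names below are examples):

    def do_tag_rename(rename_pair, tagname):
        old, new = rename_pair.split(b':', 1)
        old, new = b'refs/tags/' + old, b'refs/tags/' + new
        if tagname.startswith(old):
            return tagname.replace(old, new, 1)
        return tagname

    assert do_tag_rename(b'v:release-', b'refs/tags/v1.0') == \
           b'refs/tags/release-1.0'
    assert do_tag_rename(b'v:release-', b'refs/tags/beta') == b'refs/tags/beta'
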
3510
- def _tweak_tag(self, tag):
3511
- # Tweak the tag message according to callbacks
3512
- if self._args.replace_message:
3513
- for literal, replacement in self._args.replace_message['literals']:
3514
- tag.message = tag.message.replace(literal, replacement)
3515
- for regex, replacement in self._args.replace_message['regexes']:
3516
- tag.message = regex.sub(replacement, tag.message)
3517
- if self._message_callback:
3518
- tag.message = self._message_callback(tag.message)
3519
-
3520
- # Tweak the tag name according to tag-name-related callbacks
3521
- tag_prefix = b'refs/tags/'
3522
- fullref = tag_prefix+tag.ref
3523
- if self._args.tag_rename:
3524
- fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref)
3525
- if self._refname_callback:
3526
- fullref = self._refname_callback(fullref)
3527
- if not fullref.startswith(tag_prefix):
3528
- msg = "Error: fast-import requires tags to be in refs/tags/ namespace."
3529
- msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref)
3530
- raise SystemExit(msg)
3531
- tag.ref = fullref[len(tag_prefix):]
3532
-
3533
- # Tweak the tagger according to callbacks
3534
- if self._args.mailmap:
3535
- tag.tagger_name, tag.tagger_email = \
3536
- self._args.mailmap.translate(tag.tagger_name, tag.tagger_email)
3537
- if self._name_callback:
3538
- tag.tagger_name = self._name_callback(tag.tagger_name)
3539
- if self._email_callback:
3540
- tag.tagger_email = self._email_callback(tag.tagger_email)
3541
-
3542
- # Call general purpose tag callback
3543
- if self._tag_callback:
3544
- self._tag_callback(tag, self.callback_metadata())
3545
-
3546
- def _tweak_reset(self, reset):
3547
- if self._args.tag_rename:
3548
- reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref)
3549
- if self._refname_callback:
3550
- reset.ref = self._refname_callback(reset.ref)
3551
- if self._reset_callback:
3552
- self._reset_callback(reset, self.callback_metadata())
3553
-
3554
- def results_tmp_dir(self, create_if_missing=True):
3555
- target_working_dir = self._args.target or b'.'
3556
- git_dir = GitUtils.determine_git_dir(target_working_dir)
3557
- d = os.path.join(git_dir, b'filter-repo')
3558
- if create_if_missing and not os.path.isdir(d):
3559
- os.mkdir(d)
3560
- return d
3561
-
3562
- def _load_marks_file(self, marks_basename):
3563
- full_branch = 'refs/heads/{}'.format(self._args.state_branch)
3564
- marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
3565
- working_dir = self._args.target or b'.'
3566
- cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
3567
- contents = b''
3568
- if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
3569
- cmd = ['git', '-C', working_dir, 'show',
3570
- '%s:%s' % (full_branch, decode(marks_basename))]
3571
- try:
3572
- contents = subproc.check_output(cmd)
3573
- except subprocess.CalledProcessError as e: # pragma: no cover
3574
- raise SystemExit(_("Failed loading %s from %s") %
3575
- (decode(marks_basename), full_branch))
3576
- if contents:
3577
- biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
3578
- _IDS._next_id = max(_IDS._next_id, biggest_id+1)
3579
- with open(marks_file, 'bw') as f:
3580
- f.write(contents)
3581
- return marks_file
3582
-
3583
- def _save_marks_files(self):
3584
- basenames = [b'source-marks', b'target-marks']
3585
- working_dir = self._args.target or b'.'
3586
-
3587
- # Check whether the branch exists
3588
- parent = []
3589
- full_branch = 'refs/heads/{}'.format(self._args.state_branch)
3590
- cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
3591
- if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
3592
- parent = ['-p', full_branch]
3593
-
3594
- # Run 'git hash-object $MARKS_FILE' for each marks file, save result
3595
- blob_hashes = {}
3596
- for marks_basename in basenames:
3597
- marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
3598
- if not os.path.isfile(marks_file): # pragma: no cover
3599
- raise SystemExit(_("Failed to find %s to save to %s")
3600
- % (marks_file, self._args.state_branch))
3601
- cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
3602
- blob_hashes[marks_basename] = subproc.check_output(cmd).strip()
3603
-
3604
- # Run 'git mktree' to create a tree out of it
3605
- p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
3606
- stdin=subprocess.PIPE, stdout=subprocess.PIPE)
3607
- for b in basenames:
3608
- p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
3609
- p.stdin.close()
3610
- p.wait()
3611
- tree = p.stdout.read().strip()
3612
-
3613
- # Create the new commit
3614
- cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
3615
- tree] + parent)
3616
- commit = subproc.check_output(cmd).strip()
3617
- subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit])
3618
-
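
The state-branch save above is ordinary git plumbing: hash-object stores
each marks file as a blob, mktree wraps the blobs in a tree, commit-tree
records the tree, and update-ref advances the branch. A hedged standalone
sketch of the same sequence; the repo path, marks file, and branch name are
placeholders, and it assumes a throwaway repository:

    import subprocess

    def save_marks(repo, marks_path, branch='refs/heads/state-branch'):
        git = ['git', '-C', repo]
        blob = subprocess.check_output(
            git + ['hash-object', '-w', marks_path]).strip()
        tree = subprocess.check_output(
            git + ['mktree'],
            input=b'100644 blob %s\tsource-marks\n' % blob).strip()
        commit = subprocess.check_output(
            git + ['commit-tree', '-m', 'New mark files', tree]).strip()
        subprocess.check_call(git + ['update-ref', branch, commit])
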
3619
- def importer_only(self):
3620
- self._run_sanity_checks()
3621
- self._setup_output()
3622
-
3623
- def set_output(self, outputRepoFilter):
3624
- assert outputRepoFilter._output
3625
-
3626
- # set_output implies this RepoFilter is doing exporting, though may not
3627
- # be the only one.
3628
- self._setup_input(use_done_feature = False)
3629
-
3630
- # Set our output management up to pipe to outputRepoFilter's locations
3631
- self._managed_output = False
3632
- self._output = outputRepoFilter._output
3633
- self._import_pipes = outputRepoFilter._import_pipes
3634
-
3635
- # Handle sanity checks, though currently none needed for export-only cases
3636
- self._run_sanity_checks()
3637
-
3638
- def _setup_input(self, use_done_feature):
3639
- if self._args.stdin:
3640
- self._input = sys.stdin.detach()
3641
- sys.stdin = None # Make sure no one accidentally uses it later
3642
- self._fe_orig = None
3643
- else:
3644
- skip_blobs = (self._blob_callback is None and
3645
- self._args.replace_text is None and
3646
- self._args.source == self._args.target)
3647
- extra_flags = []
3648
- if skip_blobs:
3649
- extra_flags.append('--no-data')
3650
- if self._args.max_blob_size:
3651
- self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
3652
- if use_done_feature:
3653
- extra_flags.append('--use-done-feature')
3654
- if write_marks:
3655
- extra_flags.append(b'--mark-tags')
3656
- if self._args.state_branch:
3657
- assert(write_marks)
3658
- source_marks_file = self._load_marks_file(b'source-marks')
3659
- extra_flags.extend([b'--export-marks='+source_marks_file,
3660
- b'--import-marks='+source_marks_file])
3661
- if self._args.preserve_commit_encoding is not None: # pragma: no cover
3662
- reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
3663
- extra_flags.append('--reencode='+reencode)
3664
- location = ['-C', self._args.source] if self._args.source else []
3665
- fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids',
3666
- '--signed-tags=strip', '--tag-of-filtered-object=rewrite',
3667
- '--fake-missing-tagger', '--reference-excluded-parents'
3668
- ] + extra_flags + self._args.refs
3669
- self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
3670
- self._input = self._fep.stdout
3671
- if self._args.dry_run or self._args.debug:
3672
- self._fe_orig = os.path.join(self.results_tmp_dir(),
3673
- b'fast-export.original')
3674
- output = open(self._fe_orig, 'bw')
3675
- self._input = InputFileBackup(self._input, output)
3676
- if self._args.debug:
3677
- tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
3678
- print("[DEBUG] Running: {}".format(' '.join(tmp)))
3679
- print(" (saving a copy of the output at {})"
3680
- .format(decode(self._fe_orig)))
3681
-
3682
- def _setup_output(self):
3683
- if not self._args.dry_run:
3684
- location = ['-C', self._args.target] if self._args.target else []
3685
- fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false',
3686
- 'fast-import', '--force', '--quiet']
3687
- if date_format_permissive:
3688
- fip_cmd.append('--date-format=raw-permissive')
3689
- if self._args.state_branch:
3690
- target_marks_file = self._load_marks_file(b'target-marks')
3691
- fip_cmd.extend([b'--export-marks='+target_marks_file,
3692
- b'--import-marks='+target_marks_file])
3693
- self._fip = subproc.Popen(fip_cmd, bufsize=-1,
3694
- stdin=subprocess.PIPE, stdout=subprocess.PIPE)
3695
- self._import_pipes = (self._fip.stdin, self._fip.stdout)
3696
- if self._args.dry_run or self._args.debug:
3697
- self._fe_filt = os.path.join(self.results_tmp_dir(),
3698
- b'fast-export.filtered')
3699
- self._output = open(self._fe_filt, 'bw')
3700
- else:
3701
- self._output = self._fip.stdin
3702
- if self._args.debug and not self._args.dry_run:
3703
- self._output = DualFileWriter(self._fip.stdin, self._output)
3704
- tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
3705
- print("[DEBUG] Running: {}".format(' '.join(tmp)))
3706
- print(" (using the following file as input: {})"
3707
- .format(decode(self._fe_filt)))
3708
-
3709
- def _migrate_origin_to_heads(self):
3710
- refs_to_migrate = set(x for x in self._orig_refs
3711
- if x.startswith(b'refs/remotes/origin/'))
3712
- if not refs_to_migrate:
3713
- return
3714
- if self._args.debug:
3715
- print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
3716
- target_working_dir = self._args.target or b'.'
3717
- p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
3718
- stdin=subprocess.PIPE, cwd=target_working_dir)
3719
- for ref in refs_to_migrate:
3720
- if ref == b'refs/remotes/origin/HEAD':
3721
- p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
3722
- del self._orig_refs[ref]
3723
- continue
3724
- newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
3725
- if newref not in self._orig_refs:
3726
- p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
3727
- p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
3728
- self._orig_refs[newref] = self._orig_refs[ref]
3729
- del self._orig_refs[ref]
3730
- p.stdin.close()
3731
- if p.wait():
3732
- raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
3733
-
3734
- # Now remove
3735
- if self._args.debug:
3736
- print("[DEBUG] Removing 'origin' remote (rewritten history will no ")
3737
- print(" longer be related; consider re-pushing it elsewhere.")
3738
- subproc.call('git remote rm origin'.split(), cwd=target_working_dir)
3739
-
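
The ref migration above drives `git update-ref --no-deref --stdin`, which
applies a batch of create/delete directives read from stdin. A sketch of
the byte stream it feeds (ref names follow the code above; the hashes are
invented):

    orig_refs = {b'refs/remotes/origin/HEAD': b'a' * 40,
                 b'refs/remotes/origin/main': b'b' * 40}
    directives = []
    for ref, sha in sorted(orig_refs.items()):
        if ref == b'refs/remotes/origin/HEAD':
            directives.append(b'delete %s %s\n' % (ref, sha))  # drop alias
            continue
        newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
        directives.append(b'create %s %s\n' % (newref, sha))
        directives.append(b'delete %s %s\n' % (ref, sha))
    payload = b''.join(directives)  # fed to update-ref's stdin, then closed
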
3740
- def _final_commands(self):
3741
- self._finalize_handled = True
3742
- self._done_callback and self._done_callback()
3743
-
3744
- if not self._args.quiet:
3745
- self._progress_writer.finish()
3746
-
3747
- def _ref_update(self, target_working_dir):
3748
- # Start the update-ref process
3749
- p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
3750
- stdin=subprocess.PIPE,
3751
- cwd=target_working_dir)
3752
-
3753
- # Remove replace_refs from _orig_refs
3754
- replace_refs = {k:v for k, v in self._orig_refs.items()
3755
- if k.startswith(b'refs/replace/')}
3756
- reverse_replace_refs = collections.defaultdict(list)
3757
- for k,v in replace_refs.items():
3758
- reverse_replace_refs[v].append(k)
3759
- all(map(self._orig_refs.pop, replace_refs))
3760
-
3761
- # Remove unused refs
3762
- exported_refs, imported_refs = self.get_exported_and_imported_refs()
3763
- refs_to_nuke = exported_refs - imported_refs
3764
- if self._args.partial:
3765
- refs_to_nuke = set()
3766
- if refs_to_nuke and self._args.debug:
3767
- print("[DEBUG] Deleting the following refs:\n "+
3768
- decode(b"\n ".join(refs_to_nuke)))
3769
- p.stdin.write(b''.join([b"delete %s\n" % x
3770
- for x in refs_to_nuke]))
3771
-
3772
- # Delete or update and add replace_refs; note that fast-export automatically
3773
- # handles 'update-no-add', so we only need to take action for the other four
3774
- # choices for replace_refs.
3775
- self._flush_renames()
3776
- actual_renames = {k:v for k,v in self._commit_renames.items() if k != v}
3777
- if self._args.replace_refs in ['delete-no-add', 'delete-and-add']:
3778
- # Delete old replace refs, if unwanted
3779
- replace_refs_to_nuke = set(replace_refs)
3780
- if self._args.replace_refs == 'delete-and-add':
3781
- # git-update-ref won't allow us to update a ref twice, so be careful
3782
- # to avoid deleting refs we'll later update
3783
- replace_refs_to_nuke = replace_refs_to_nuke.difference(
3784
- [b'refs/replace/'+x for x in actual_renames])
3785
- p.stdin.write(b''.join([b"delete %s\n" % x
3786
- for x in replace_refs_to_nuke]))
3787
- if self._args.replace_refs in ['delete-and-add', 'update-or-add',
3788
- 'update-and-add']:
3789
- # Add new replace refs
3790
- update_only = (self._args.replace_refs == 'update-or-add')
3791
- p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new)
3792
- for old,new in actual_renames.items()
3793
- if new and not (update_only and
3794
- old in reverse_replace_refs)]))
3795
-
3796
- # Complete the update-ref process
3797
- p.stdin.close()
3798
- if p.wait():
3799
- raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
3800
-
3801
- def _record_metadata(self, metadata_dir, orig_refs):
3802
- self._flush_renames()
3803
- with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
3804
- f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
3805
- for (old,new) in self._commit_renames.items():
3806
- msg = b'%s %s\n' % (old, new if new is not None else deleted_hash)
3807
- f.write(msg)
3808
-
3809
- exported_refs, imported_refs = self.get_exported_and_imported_refs()
3810
-
3811
- batch_check_process = None
3812
- batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$')
3813
- with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f:
3814
- f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode())
3815
- for refname, old_hash in orig_refs.items():
3816
- if refname not in exported_refs:
3817
- continue
3818
- if refname not in imported_refs:
3819
- new_hash = deleted_hash
3820
- elif old_hash in self._commit_renames:
3821
- new_hash = self._commit_renames[old_hash]
3822
- new_hash = new_hash if new_hash is not None else deleted_hash
3823
- else: # Must be either an annotated tag, or a ref whose tip was pruned
3824
- if not batch_check_process:
3825
- cmd = 'git cat-file --batch-check'.split()
3826
- target_working_dir = self._args.target or b'.'
3827
- batch_check_process = subproc.Popen(cmd,
3828
- stdin=subprocess.PIPE,
3829
- stdout=subprocess.PIPE,
3830
- cwd=target_working_dir)
3831
- batch_check_process.stdin.write(refname+b"\n")
3832
- batch_check_process.stdin.flush()
3833
- line = batch_check_process.stdout.readline()
3834
- m = batch_check_output_re.match(line)
3835
- if m and m.group(2) in (b'tag', b'commit'):
3836
- new_hash = m.group(1)
3837
- elif line.endswith(b' missing\n'):
3838
- new_hash = deleted_hash
3839
- else:
3840
- raise SystemExit(_("Failed to find new id for %(refname)s "
3841
- "(old id was %(old_hash)s)")
3842
- % ({'refname': refname, 'old_hash': old_hash})
3843
- ) # pragma: no cover
3844
- f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
3845
- if self._args.source or self._args.target:
3846
- new_refs = GitUtils.get_refs(self._args.target or b'.')
3847
- for ref, new_hash in new_refs.items():
3848
- if ref not in orig_refs and not ref.startswith(b'refs/replace/'):
3849
- old_hash = b'0'*len(new_hash)
3850
- f.write(b'%s %s %s\n' % (old_hash, new_hash, ref))
3851
- if batch_check_process:
3852
- batch_check_process.stdin.close()
3853
- batch_check_process.wait()
3854
-
3855
- with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
3856
- issues_found = False
3857
- if self._commits_no_longer_merges:
3858
- issues_found = True
3859
-
3860
- f.write(textwrap.dedent(_('''
3861
- The following commits used to be merge commits but due to filtering
3862
- are now regular commits; they likely have suboptimal commit messages
3863
- (e.g. "Merge branch next into master"). Original commit hash on the
3864
- left, commit hash after filtering/rewriting on the right:
3865
- ''')[1:]).encode())
3866
- for oldhash, newhash in self._commits_no_longer_merges:
3867
- f.write(' {} {}\n'.format(oldhash, newhash).encode())
3868
- f.write(b'\n')
3869
-
3870
- if self._commits_referenced_but_removed:
3871
- issues_found = True
3872
- f.write(textwrap.dedent(_('''
3873
- The following commits were filtered out, but referenced in another
3874
- commit message. The reference to the now-nonexistent commit hash
3875
- (or a substring thereof) was left as-is in any commit messages:
3876
- ''')[1:]).encode())
3877
- for bad_commit_reference in self._commits_referenced_but_removed:
3878
- f.write(' {}\n'.format(bad_commit_reference).encode())
3879
- f.write(b'\n')
3880
-
3881
- if not issues_found:
3882
- f.write(_("No filtering problems encountered.\n").encode())
3883
-
3884
- with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f:
3885
- f.write(_("This file exists to allow you to filter again without --force.\n").encode())
3886
-
3887
- def finish(self):
3888
- ''' Alternative to run() when there is no input of our own to parse,
3889
- meaning that run only really needs to close the handle to fast-import
3890
- and let it finish, thus making a call to "run" feel like a misnomer. '''
3891
- assert not self._input
3892
- assert self._managed_output
3893
- self.run()
3894
-
3895
- def insert(self, obj, direct_insertion = False):
3896
- if not direct_insertion:
3897
- if type(obj) == Blob:
3898
- self._tweak_blob(obj)
3899
- elif type(obj) == Commit:
3900
- aux_info = {'orig_parents': obj.parents,
3901
- 'had_file_changes': bool(obj.file_changes)}
3902
- self._tweak_commit(obj, aux_info)
3903
- elif type(obj) == Reset:
3904
- self._tweak_reset(obj)
3905
- elif type(obj) == Tag:
3906
- self._tweak_tag(obj)
3907
- self._insert_into_stream(obj)
3908
-
3909
- def _insert_into_stream(self, obj):
3910
- if not obj.dumped:
3911
- if self._parser:
3912
- self._parser.insert(obj)
3913
- else:
3914
- obj.dump(self._output)
3915
-
3916
- def get_exported_and_imported_refs(self):
3917
- return self._parser.get_exported_and_imported_refs()
3918
-
3919
- def run(self):
3920
- start = time.time()
3921
- if not self._input and not self._output:
3922
- self._run_sanity_checks()
3923
- if not self._args.dry_run and not self._args.partial:
3924
- self._migrate_origin_to_heads()
3925
- self._setup_input(use_done_feature = True)
3926
- self._setup_output()
3927
- assert self._sanity_checks_handled
3928
-
3929
- if self._input:
3930
- # Create and run the filter
3931
- self._repo_working_dir = self._args.source or b'.'
3932
- self._parser = FastExportParser(blob_callback = self._tweak_blob,
3933
- commit_callback = self._tweak_commit,
3934
- tag_callback = self._tweak_tag,
3935
- reset_callback = self._tweak_reset,
3936
- done_callback = self._final_commands)
3937
- self._parser.run(self._input, self._output)
3938
- if not self._finalize_handled:
3939
- self._final_commands()
3940
-
3941
- # Make sure fast-export completed successfully
3942
- if not self._args.stdin and self._fep.wait():
3943
- raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover
3944
- self._input.close()
3945
-
3946
- # If we're not the manager of self._output, we should avoid post-run cleanup
3947
- if not self._managed_output:
3948
- return
3949
-
3950
- # Close the output and ensure fast-import successfully completes
3951
- self._output.close()
3952
- if not self._args.dry_run and self._fip.wait():
3953
- raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover
3954
-
3955
- # With fast-export and fast-import complete, update state if requested
3956
- if self._args.state_branch:
3957
- self._save_marks_files()
3958
-
3959
- # Notify user how long it took, before doing a gc and such
3960
- msg = "New history written in {:.2f} seconds..."
3961
- if self._args.repack:
3962
- msg = "New history written in {:.2f} seconds; now repacking/cleaning..."
3963
- print(msg.format(time.time()-start))
3964
-
3965
- # Exit early, if requested
3966
- if self._args.dry_run:
3967
- print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed."))
3968
- if self._fe_orig:
3969
- print(_(" Requested filtering can be seen by comparing:"))
3970
- print(" " + decode(self._fe_orig))
3971
- else:
3972
- print(_(" Requested filtering can be seen at:"))
3973
- print(" " + decode(self._fe_filt))
3974
- return
3975
-
3976
- target_working_dir = self._args.target or b'.'
3977
- if self._input:
3978
- self._ref_update(target_working_dir)
3979
-
3980
- # Write out data about run
3981
- self._record_metadata(self.results_tmp_dir(), self._orig_refs)
3982
-
3983
- # Final cleanup:
3984
- # If we need a repack, then nuke the reflogs and repack.
3985
- # If we need a reset, do a reset --hard
3986
- reset = not GitUtils.is_repository_bare(target_working_dir)
3987
- RepoFilter.cleanup(target_working_dir, self._args.repack, reset,
3988
- run_quietly=self._args.quiet,
3989
- show_debuginfo=self._args.debug)
3990
-
3991
- # Let user know how long it took
3992
- print(_("Completely finished after {:.2f} seconds.")
3993
- .format(time.time()-start))
3994
-
3995
- def main():
3996
- setup_gettext()
3997
- args = FilteringOptions.parse_args(sys.argv[1:])
3998
- if args.analyze:
3999
- RepoAnalyze.run(args)
4000
- else:
4001
- filter = RepoFilter(args)
4002
- filter.run()
4003
-
4004
- if __name__ == '__main__':
4005
- main()