repominer-GDeLuisi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: repominer-GDeLuisi
3
+ Version: 0.1.0
4
+ Summary: Tool for mining git managed repositories
5
+ Author: Gerardo De Luisi
6
+ License-File: LICENSE
7
+ Keywords: git,miner,projects
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Python: >=3.10
15
+ Description-Content-Type: text/markdown
16
+
17
+ # repository_miner
18
+ A Python library for mining git repositories.
19
+ Requires git >= 2.40 to be installed and available on PATH.
@@ -0,0 +1,12 @@
1
+ src/repository_miner/__init__.py,sha256=sYtS1nL_0XlvcmJrIxE7TQwfcO1f2HDcs1rvFWt-aww,115
2
+ src/repository_miner/data_typing.py,sha256=9mZcFk0s-7CrrGcvKxUqNiORkNk8x_sn0Vjc8PxwRqw,2964
3
+ src/repository_miner/exceptions.py,sha256=xFKI_ra-6FW_Ux3vF-xVM2XRFowAX0FRXzFRLTuTZUg,411
4
+ src/repository_miner/git.py,sha256=UzbDWLbRGxcwMT8tyPSn8U0UOGoHgJvDWV0-BcBnmR8,1293
5
+ src/repository_miner/helper.py,sha256=dLVoA-xW1kQoT7VPYy4kyj1FDmkOdElZJPs_S3Ev51c,5829
6
+ src/repository_miner/miner.py,sha256=GTGJHL-3KF9VamFnlPy0RQvaCyiijxsDlzb-fJZ9I4Y,5035
7
+ src/repository_miner/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ src/repository_miner/utility.py,sha256=y6rgmmitk4UuZI8AthuNa716UfJSjOL9kZaJ-d7Zc8I,1275
9
+ repominer_gdeluisi-0.1.0.dist-info/METADATA,sha256=zLcpyckfl5WolpDm0e3ccyfj2ELAgy-Pk1NTl4FQjcU,646
10
+ repominer_gdeluisi-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ repominer_gdeluisi-0.1.0.dist-info/licenses/LICENSE,sha256=rPNAmE1lTJ0R9WVW3rDw85fHnHGXR_AbHj_aSI7njO8,1064
12
+ repominer_gdeluisi-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 GeggeDL
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ from .utility import *
2
+ from .data_typing import *
3
+ from .git import Git
4
+ from .helper import *
5
+ from .miner import *
6
+
@@ -0,0 +1,106 @@
1
+ from dataclasses import dataclass,field
2
+ from datetime import datetime
3
+ from time import strptime
4
+ from typing import Literal,get_args,Iterable,Optional,Union,Generator,Callable
5
+ import json
6
+ from pathlib import Path
7
+ from repository_miner.utility import Call
8
+
9
@dataclass
class Author():
    """A commit author identified by name and e-mail address.

    Equality and hashing consider only ``name`` and ``email``; the list of
    authored commits does not take part in either.
    """
    email: str
    name: str
    # One string per commit attributed to this author; a fresh list is
    # created for every instance.
    commits_authored: list[str] = field(default_factory=list)

    def __hash__(self):
        return hash(repr(self.name) + repr(self.email))

    def __eq__(self, value):
        # Returning NotImplemented (rather than raising) lets Python try the
        # reflected comparison and finally fall back to "not equal", so
        # expressions such as ``author == None`` or ``author in mixed_list``
        # work instead of blowing up with TypeError.
        if not isinstance(value, Author):
            return NotImplemented
        return self.name == value.name and self.email == value.email

    def __str__(self):
        return f"Name: {self.name} , Email: {self.email}"

    def __repr__(self):
        return f"Name: {self.name} , Email: {self.email} , Commits: {self.commits_authored}"
24
+
25
@dataclass
class CommitInfo():
    """Plain record holding the metadata of a single commit.

    Instances hash by ``commit_hash``. ``get_tree`` is a placeholder that
    subclasses are expected to override.
    """
    commit_hash: str   # full commit hash; the only field used for hashing
    abbr_hash: str     # abbreviated form of the commit hash
    tree: str          # identifier of the tree associated with the commit
    refs: str          # ref names attached to the commit, as one string
    subject: str       # commit subject line
    author_name: str
    author_email: str
    date: datetime

    def get_tree(self) -> 'Tree':
        """Return the commit's tree; not available on the base record.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def __hash__(self):
        return hash(self.commit_hash)
39
+
40
@dataclass
class Head():
    """A named reference together with the hash it points to.

    Instances hash by ``hash``. ``traverse_commits`` is a placeholder that
    subclasses are expected to override.
    """
    name: str
    hash: str

    def traverse_commits(self) -> Generator[CommitInfo, None, None]:
        """Yield the commits reachable from this head.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def __hash__(self):
        return hash(self.hash)
48
+
49
class HeadImpl(Head):
    """Head whose commits are produced on demand by a stored callable."""

    def __init__(self, name: str, hash: str, retrieve_func: Call):
        """Store the head data and the callable used by ``traverse_commits``.

        Args:
            name (str): name of the head
            hash (str): hash the head points to
            retrieve_func (Call): zero-argument callable returning the commits
        """
        super().__init__(name=name, hash=hash)
        self.retrieve_func = retrieve_func

    def traverse_commits(self):
        """Return whatever the stored ``retrieve_func`` produces."""
        fetch_commits = self.retrieve_func
        return fetch_commits()
55
+
56
+
57
@dataclass
class Blob():
    """Record describing a blob object.

    Instances hash by ``hash``. ``get_source`` is a placeholder that
    subclasses are expected to override.
    """
    hash: str
    name: str
    path: str
    size: int

    def get_source(self) -> list[str]:
        """Return the blob content as a list of strings.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def __hash__(self):
        return hash(self.hash)
67
+
68
@dataclass
class Tree():
    """Record describing a tree object.

    Instances hash by ``hash``. ``traverse`` is a placeholder that subclasses
    are expected to override.
    """
    hash: str
    path: str

    def __hash__(self):
        return hash(self.hash)

    def traverse(self) -> Generator[Union['Tree', Blob], None, None]:
        """Yield the trees and blobs contained in this tree.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError
76
+
77
class TreeImpl(Tree):
    """Tree whose entries are produced on demand by a stored callable."""

    def __init__(self, hash: str, path: str, iter_function: Call):
        """Store the tree data and the callable used by ``traverse``.

        Args:
            hash (str): identifier of the tree
            path (str): path of the tree
            iter_function (Call): zero-argument callable returning the entries
        """
        super().__init__(hash=hash, path=path)
        self.iter_func = iter_function

    def traverse(self) -> Generator[Union[Tree, Blob], None, None]:
        """Return whatever the stored ``iter_func`` produces."""
        list_entries = self.iter_func
        return list_entries()
83
+
84
class CommitInfoImpl(CommitInfo):
    """CommitInfo whose tree is produced on demand by a stored callable."""

    def __init__(
        self,
        commit_hash: str,
        abbr_hash: str,
        tree: str,
        refs: str,
        subject: str,
        author_name: str,
        author_email: str,
        date: datetime,
        tree_func: Call,
    ):
        """Store the commit metadata and the callable used by ``get_tree``.

        All arguments except ``tree_func`` are forwarded unchanged to
        ``CommitInfo``; ``tree_func`` is a zero-argument callable returning
        the tree.
        """
        super().__init__(
            commit_hash=commit_hash,
            abbr_hash=abbr_hash,
            tree=tree,
            refs=refs,
            subject=subject,
            author_name=author_name,
            author_email=author_email,
            date=date,
        )
        self.tree_func = tree_func

    def get_tree(self) -> Tree:
        """Return whatever the stored ``tree_func`` produces."""
        build_tree = self.tree_func
        return build_tree()
100
+
101
class BlobImpl(Blob):
    """Blob whose content is produced on demand by a stored callable."""

    def __init__(self, hash: str, name: str, path: str, size: int, source_func: Call):
        """Store the blob data and the callable used by ``get_source``.

        Args:
            hash (str): identifier of the blob
            name (str): name of the blob
            path (str): path of the blob
            size (int): size of the blob
            source_func (Call): zero-argument callable returning the content
        """
        super().__init__(hash=hash, name=name, path=path, size=size)
        self.source_func = source_func

    def get_source(self):
        """Return whatever the stored ``source_func`` produces."""
        read_source = self.source_func
        return read_source()
@@ -0,0 +1,15 @@
1
class GitNotFoundException(Exception):
    """Error type signalling that git could not be found.

    Accepts the same positional arguments as ``Exception``.
    """
4
+
5
class GitCmdError(Exception):
    """Error type signalling that a git command failed.

    Accepts the same positional arguments as ``Exception``.
    """
8
+
9
class NotGitRepositoryError(Exception):
    """Error type signalling that a directory is not a git repository.

    Accepts the same positional arguments as ``Exception``.
    """
12
+
13
class ParsingException(Exception):
    """Error type signalling that some text could not be parsed.

    Accepts the same positional arguments as ``Exception``.
    """
@@ -0,0 +1,31 @@
1
+ from .utility import execute_command
2
+ from .helper import cmd_builder,log_builder,rev_list_builder,get_head_commit,is_dir_a_repo,is_git_available
3
+ from .exceptions import *
4
+ from functools import partial
5
+ from typing import Iterable,Optional
6
+ from datetime import datetime
7
+ from subprocess import CalledProcessError
8
class Git():
    """Thin wrapper that runs git sub-commands against one repository.

    Any public attribute that is not otherwise defined is turned into a
    callable running the git command of the same name, with underscores
    replaced by dashes: ``git.rev_list(...)`` runs ``rev-list``.
    """

    def __init__(self, path: str):
        """Bind the wrapper to the repository at ``path``.

        Args:
            path (str): directory of the git repository

        Raises:
            GitNotFoundException: if ``is_git_available()`` reports False
            NotGitRepositoryError: if ``is_dir_a_repo(path)`` reports False
        """
        if not is_git_available():
            raise GitNotFoundException("Git not found")
        if not is_dir_a_repo(path):
            raise NotGitRepositoryError(f"Directory {path} is not a git repository")
        self.path = path

    def _execute_command(self, command: str, *args) -> str:
        """Build and run ``command`` for this repository.

        Args:
            command (str): git sub-command to run
            *args: command arguments; a single non-string iterable is
                expanded and used as the argument list

        Returns:
            str: whatever ``execute_command`` returns for the built command

        Raises:
            GitCmdError: if the command exits with an error
        """
        if len(args) == 1 and not isinstance(args[0], str) and isinstance(args[0], Iterable):
            args = tuple(args[0])
        cmd = cmd_builder(command, self.path, *args)
        try:
            return execute_command(cmd)
        except CalledProcessError as e:
            # stderr is None when the runner did not capture it; fall back to
            # the exit status so the message is never just "None".
            detail = e.stderr if e.stderr else f"(exit status {e.returncode})"
            raise GitCmdError(f"Command {cmd} raised an error {detail}") from e

    def __getattr__(self, name: str):
        """Map an unknown public attribute to the git command of that name.

        Raises:
            AttributeError: for names starting with an underscore
        """
        # __getattr__ only runs after normal lookup has failed, so there is
        # nothing to find on the instance or class here. Private and dunder
        # names must fail normally: protocols such as copy and pickle probe
        # for optional hooks (``__setstate__``, ``__deepcopy__``, ...) and
        # would otherwise receive a bogus git command instead of a miss.
        if name.startswith("_"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return partial(self._execute_command, name.replace("_", "-"))
@@ -0,0 +1,156 @@
1
+ from pathlib import Path
2
+ from shutil import which
3
+ import sys
4
+ import os
5
+ import subprocess
6
+ from typing import Optional
7
+ from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
8
+ from math import floor,ceil
9
+ from functools import partial
10
+ from typing import Iterable
11
+ import json
12
+ from datetime import datetime
13
+ # max_worker = min(32,os.cpu_count())
14
+
15
def date_builder(since:Optional[datetime]=None,to:Optional[datetime]=None)->list[str]:
    """Translate an optional date window into ``--since``/``--until`` flags.

    Args:
        since (Optional[datetime], optional): lower bound. Defaults to None.
        to (Optional[datetime], optional): upper bound. Defaults to None.

    Raises:
        ValueError: if both bounds are given and ``to`` precedes ``since``

    Returns:
        list[str]: zero, one or two flags, ``--since`` first; dates are
        rendered as ``YYYY-MM-DD`` wrapped in single quotes
    """
    if since and to and to < since:
        raise ValueError("'to' cannot come before 'since'")
    day_format = r"%Y-%m-%d"
    bounds = (("--since", since), ("--until", to))
    return [
        f"{flag}='{moment.strftime(day_format)}'"
        for flag, moment in bounds
        if moment
    ]
26
+
27
def cmd_builder(command:str,repo:str,*args)->str:
    """Base git command generator

    Args:
        command (str): command to use
        repo (str): git directory to execute the command on
        *args: extra arguments appended after the command, separated by
            single spaces; non-string values are converted with ``str``

    Returns:
        str: The complete command as a string, ``git -C <repo> <command> <args>``
    """
    repo = str(repo)
    # The result is a single command string, so a path containing whitespace
    # would be split into several arguments. Wrap such paths in double quotes
    # (unless the caller already did); other paths are left untouched so the
    # output for them is unchanged.
    already_quoted = len(repo) >= 2 and repo[0] == '"' and repo[-1] == '"'
    if not already_quoted and any(ch.isspace() for ch in repo):
        repo = f'"{repo}"'
    # The separator after the command is always emitted, even with no args.
    return f"git -C {repo} {command} " + " ".join(str(arg) for arg in args)
41
+
42
def range_builder(from_commmit:str,to_commit:Optional[str]=None)->str:
    """Build the revision argument for a history query.

    Args:
        from_commmit (str): mandatory revision
        to_commit (Optional[str], optional): second revision. Defaults to None.

    Raises:
        ValueError: if ``from_commmit`` is empty or None

    Returns:
        str: ``<to_commit>..<from_commmit>`` when ``to_commit`` is given,
        otherwise ``from_commmit`` alone
    """
    if from_commmit:
        return f"{to_commit}..{from_commmit}" if to_commit else from_commmit
    raise ValueError("'from_commit' parameter must always be valorized")
49
+
50
def log_builder(from_commit:str,to_commit:Optional[str]=None,pretty:Optional[str]=None,merges:bool=False,max_count:Optional[int]=None,skip:Optional[int]=None,author:Optional[str]=None,follow:Optional[str]=None,since:Optional[datetime]=None,to:Optional[datetime]=None,args=None)->str:
    """Builds the argument string for a log command

    Args:
        from_commit (str): The commit from which start the logging operation
        to_commit (Optional[str], optional): Second revision of the range, passed to ``range_builder``. Defaults to None.
        pretty (Optional[str], optional): The format used by --pretty. Defaults to None.
        merges (bool, optional): Specifies whether load merge commits; when False ``--no-merges`` is added. Defaults to False.
        max_count (Optional[int], optional): Parameter for --max-count flag. Defaults to None.
        skip (Optional[int], optional): Parameter for --skip flag. Defaults to None.
        author (Optional[str], optional): Filter only commits authored by the passed author. Defaults to None.
        follow (Optional[str], optional): Filter only commits which changed the passed file. Defaults to None.
        since (Optional[datetime], optional): Lower date bound, passed to ``date_builder``. Defaults to None.
        to (Optional[datetime], optional): Upper date bound, passed to ``date_builder``. Defaults to None.
        args (optional): Extra arguments appended verbatim. Defaults to None (no extra arguments).

    Raises:
        ValueError: if ``max_count`` is not positive or ``skip`` is negative

    Returns:
        str: The space separated arguments for the log command
    """
    arg_list=[range_builder(from_commit,to_commit)]
    if max_count is not None:
        if max_count<=0:
            raise ValueError("max_count cannot be negative or 0")
        arg_list.append(f"--max-count={max_count}")
    if skip is not None:
        if skip<0:
            raise ValueError("skip cannot be negative")
        arg_list.append(f"--skip={skip}")
    # ``merges`` means "load merge commits", so the exclusion flag belongs to
    # the False case (the previous condition was inverted).
    if not merges:
        arg_list.append("--no-merges")
    if pretty is not None:
        arg_list.append(f'--pretty="format:{pretty}"')
    if author:
        arg_list.append(f'--author="{author}"')
    arg_list.extend(date_builder(since,to))
    if args:
        arg_list.extend(args)
    # The path filter goes last because everything after ``--`` is a path.
    if follow:
        arg_list.append(f'--follow -- "{follow}"')
    return " ".join(arg_list)
86
+
87
def rev_list_builder(from_commit:str,to_commit:Optional[str]=None,pretty:Optional[str]=None,merges:bool=False,max_count:Optional[int]=None,skip:Optional[int]=None,author:Optional[str]=None,since:Optional[datetime]=None,to:Optional[datetime]=None,args=None)->str:
    """Builds the argument string for a rev-list command

    Args:
        from_commit (str): The commit from which start the listing operation
        to_commit (Optional[str], optional): Second revision of the range, passed to ``range_builder``. Defaults to None.
        pretty (Optional[str], optional): The format used by --pretty. Defaults to None.
        merges (bool, optional): Specifies whether load merge commits; when False ``--no-merges`` is added. Defaults to False.
        max_count (Optional[int], optional): Parameter for --max-count flag. Defaults to None.
        skip (Optional[int], optional): Parameter for --skip flag. Defaults to None.
        author (Optional[str], optional): Filter only commits authored by the passed author. Defaults to None.
        since (Optional[datetime], optional): Lower date bound, passed to ``date_builder``. Defaults to None.
        to (Optional[datetime], optional): Upper date bound, passed to ``date_builder``. Defaults to None.
        args (optional): Extra arguments appended verbatim. Defaults to None (no extra arguments).

    Raises:
        ValueError: if ``max_count`` is not positive or ``skip`` is negative

    Returns:
        str: The space separated arguments for the rev-list command
    """
    arg_list=[range_builder(from_commit,to_commit)]
    if max_count is not None:
        if max_count<=0:
            raise ValueError("max_count cannot be negative or 0")
        arg_list.append(f"--max-count={max_count}")
    if skip is not None:
        if skip<0:
            raise ValueError("skip cannot be negative")
        arg_list.append(f"--skip={skip}")
    # ``merges`` means "load merge commits", so the exclusion flag belongs to
    # the False case (the previous condition was inverted).
    if not merges:
        arg_list.append("--no-merges")
    if pretty is not None:
        arg_list.append(f'--pretty="format:{pretty}"')
    if author is not None:
        arg_list.append(f'--author="{author}"')
    arg_list.extend(date_builder(since,to))
    if args:
        arg_list.extend(args)
    return " ".join(arg_list)
120
+
121
def is_git_available()->bool:
    """Tell whether a ``git`` executable can be located with ``shutil.which``.

    Returns:
        bool: True when ``which("git")`` finds something, False otherwise
    """
    git_path = which("git")
    return git_path is not None
128
+
129
def is_dir_a_repo(path:str)->bool:
    """Checks whether the path points to a git directory

    The check runs ``git -C <path> rev-parse HEAD`` and reports whether it
    succeeded, so a directory in which HEAD cannot be resolved is reported as
    not being a repository.

    Args:
        path (str): path to repo dir

    Returns:
        bool: Returns whether the directory is a repo
    """
    repo_dir = Path(path).resolve().as_posix()
    try:
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact; the output is discarded because only the
        # exit status matters here.
        completed = subprocess.run(
            ["git", "-C", repo_dir, "rev-parse", "HEAD"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # git could not be started at all (e.g. it is not installed).
        return False
    return completed.returncode == 0
144
+
145
def get_head_commit(path:str)->str:
    """Return head commit

    Args:
        path (str): path to git directory

    Raises:
        subprocess.CalledProcessError: if ``git rev-parse HEAD`` fails
        OSError: if git cannot be started

    Returns:
        str: Returns HEAD's commit sha
    """
    repo_dir = Path(path).resolve().as_posix()
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    output = subprocess.check_output(["git", "-C", repo_dir, "rev-parse", "HEAD"])
    # strip() removes the line ending whatever its form, instead of blindly
    # dropping exactly one character.
    return output.decode().strip()
156
+
@@ -0,0 +1,98 @@
1
+ from .git import Git
2
+ from .utility import Call
3
+ from .helper import cmd_builder,log_builder,rev_list_builder,get_head_commit,is_dir_a_repo,is_git_available
4
+ from .exceptions import *
5
+ from .data_typing import *
6
+ from functools import partial
7
+ from typing import Iterable,Optional,Generator
8
+ from datetime import datetime
9
+ import re
10
class RepoMiner():
    """Read-only access to the commits, trees, blobs, branches and authors of
    a git repository, built on top of the ``Git`` command wrapper."""

    # Format handed to ``git log``: hash, tree, subject, author name, author
    # email, short author date and ref names, separated by ``///``.
    _PRETTY=r"%H///%T///%s///%an///%ae///%as///%D"

    def __init__(self,path:str):
        """Bind the miner to the repository at ``path``.

        Args:
            path (str): directory of the git repository
        """
        self.git=Git(path)
        self.path=path

    def _parse_log_line(self,log:str)->CommitInfo:
        """Turn one log line produced with ``_PRETTY`` into a commit object.

        Raises:
            ValueError: if the line does not have the expected fields
        """
        # Only the subject may realistically contain the separator (for
        # example a ``file:///`` URL), so the fixed fields are peeled off both
        # ends and whatever remains in the middle is the subject.
        c_hash,tree,rest=log.split("///",2)
        sub,a_name,a_email,c_date,ref=rest.rsplit("///",4)
        date=datetime.strptime(c_date,r"%Y-%m-%d")
        return CommitInfoImpl(c_hash,c_hash[:7],tree,ref,sub,a_name,a_email,date,Call(self.tree,tree))

    def retrieve_commits(self,from_commit:Optional[str]=None,to_commit:Optional[str]=None,merges:bool=False,max_count:Optional[int]=None,skip:Optional[int]=None,author:Optional[str]=None,follow:Optional[str]=None,since:Optional[datetime]=None,to:Optional[datetime]=None,extra_args:Optional[Iterable[str]]=None)->Generator[CommitInfo,None,None]:
        """Yield the commits selected by the given filters.

        Args:
            from_commit (Optional[str], optional): starting revision; HEAD when omitted.
            extra_args (Optional[Iterable[str]], optional): extra arguments for the log command.

        Every other argument is forwarded unchanged to ``log_builder``.

        Raises:
            ParsingException: if a log line cannot be parsed
        """
        if not from_commit:
            from_commit=get_head_commit(self.path)
        extra=list(extra_args) if extra_args else []
        logs=self.git.log(log_builder(from_commit,to_commit,self._PRETTY,merges,max_count,skip,author,follow,since,to,extra))
        for log in logs.splitlines(False):
            try:
                commit=self._parse_log_line(log)
            except ValueError as e:
                raise ParsingException(f"Log {log} was not parsed") from e
            yield commit

    def n_commits(self,from_commit:Optional[str]=None,to_commit:Optional[str]=None,merges:bool=True,skip:Optional[int]=None,author:Optional[str]=None,since:Optional[datetime]=None,to:Optional[datetime]=None)->int:
        """Count commits with ``rev-list --count``.

        ``from_commit`` defaults to HEAD; every other argument is forwarded
        unchanged to ``rev_list_builder``.
        """
        if not from_commit:
            from_commit=get_head_commit(self.path)
        return int(self.git.rev_list(rev_list_builder(from_commit=from_commit,to_commit=to_commit,merges=merges,max_count=None,skip=skip,author=author,since=since,to=to,args=["--count"])))

    def tree(self,treeish:str)->Tree:
        """Return a lazily traversable tree for ``treeish``.

        Raises:
            ValueError: if the object cannot be inspected or is a blob
        """
        try:
            t=self.git.cat_file(["-t",treeish])
        except GitCmdError as e:
            raise ValueError(f"Cannot retrieve a tree from {treeish}") from e
        # The previous guard (``t == "blob" and t == "tag"``) could never be
        # true. A blob can never be listed as a tree, so it is rejected here;
        # tags are still accepted, exactly as they effectively were before.
        if t == "blob":
            raise ValueError(f"Cannot retrieve a tree from {treeish}")
        return TreeImpl(treeish,"",Call(self.iterate_tree,treeish,True))

    def iterate_tree(self,treeish:str,recursive:bool=False)->Generator[Union[Tree,Blob],None,None]:
        """Yield the trees and blobs listed by ``ls-tree`` for ``treeish``.

        Args:
            treeish (str): object to list
            recursive (bool, optional): add ``-r -t`` to the listing. Defaults to False.

        Raises:
            ValueError: if the listing command fails
            ParsingException: if an output line cannot be parsed
        """
        p_format="--format=\"%(objectname)///%(objecttype)///%(objectsize)///%(path)\""
        args=[p_format]
        if recursive:
            args.append("-r")
            args.append("-t")
        args.append(treeish)
        try:
            res=self.git.ls_tree(args)
        except GitCmdError as e:
            raise ValueError(f"Cannot retrieve a tree from {treeish}") from e
        for line in res.splitlines(False):
            try:
                # maxsplit keeps a path that itself contains the separator whole
                h,t,size,path=line.split('///',3)
                if t == "blob":
                    size=int(size)
            except ValueError as e:
                raise ParsingException(f"Unable to parse tree line {line}") from e
            if t == "tree":
                # Bind the sub-tree to its own hash; binding it to the parent
                # made every sub-tree list the parent again. Entries obtained
                # through the sub-tree are listed starting from that sub-tree.
                yield TreeImpl(h,path,Call(self.iterate_tree,treeish=h,recursive=True))
            elif t == "blob":
                yield BlobImpl(h,path.rsplit("/",1)[-1],path,size,Call(self.get_source,h))

    def get_commit(self,commit_sha:str)->CommitInfo:
        """Return the commit identified by ``commit_sha``.

        Raises:
            ValueError: if the log output does not have the expected fields
        """
        log=self.git.log(log_builder(commit_sha,None,self._PRETTY,max_count=1))
        return self._parse_log_line(log)

    def local_branches(self)->Generator[Head,None,None]:
        """Yield one head per line printed by ``git branch -l``."""
        for branch in self.git.branch("-l").splitlines():
            name=branch.strip("*").strip()
            # A detached HEAD is printed as a parenthesised description, which
            # is not a branch name and cannot be resolved; skip it.
            if not name or name.startswith("("):
                continue
            yield HeadImpl(name,self.git.rev_parse(name),Call(self.retrieve_commits,from_commit=name,merges=True))

    def authors(self)->set[Author]:
        """Return the authors reported by ``git shortlog -e --all``, each with
        the lines listed under its header as ``commits_authored``."""
        # Header lines look like ``Name <email> (count):``. The name is matched
        # loosely so digits, hyphens, dots, brackets and the like are accepted.
        pattern=re.compile(r'(.+) <([^<>]*)> \(\d+\)')
        authors=set()
        res=self.git.shortlog(["-e","--all","--pretty='format:%H'"])
        # Blocks are separated by a blank line. Every block is inspected (the
        # last one is a real author once the output has been stripped); empty
        # leftovers simply fail to match below.
        for a_block in res.split("\n\n"):
            lines=a_block.split("\n")
            header=lines.pop(0).strip()
            match=pattern.match(header)
            if not match:
                continue
            name,email=match.groups()
            authors.add(Author(name,email,[line.strip() for line in lines]))
        return authors

    def get_source(self, id:str)->list[str]:
        """Return the content of the blob ``id`` split into lines.

        Raises:
            FileNotFoundError: if the object type cannot be retrieved
            TypeError: if the object is not a blob
        """
        try:
            obj_type=self.git.cat_file("-t",id)
        except GitCmdError as e:
            raise FileNotFoundError("Couldn't retrieve the object") from e
        if obj_type != "blob":
            raise TypeError(f"Hexsha {id} in not a blob")
        return re.split(string=self.git.cat_file("-p",id),pattern=r"\r\n|\r|\n")
File without changes
@@ -0,0 +1,40 @@
1
+ import subprocess
2
+ from math import floor,ceil
3
+ from typing import Iterable,Callable,Any
4
+
5
class Call():
    """A function bundled with the arguments it will later be called with."""

    def __init__(self, func: Callable[..., Any], *args, **kwargs):
        """Remember ``func`` together with its positional and keyword arguments."""
        self.func, self.args, self.kwargs = func, args, kwargs

    def __call__(self, *args, **kwds):
        """Invoke the stored function with the stored arguments.

        Arguments supplied at call time are accepted but not used.
        """
        target = self.func
        return target(*self.args, **self.kwargs)
12
+
13
def execute_command(command:str)->str:
    """Run ``command`` through the shell and return its standard output.

    Args:
        command (str): complete command line to execute

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status; its ``stderr`` attribute holds the captured error output

    Returns:
        str: the command's standard output, decoded as UTF-8, with leading and
        trailing whitespace removed
    """
    # stderr is captured so that it is available on the raised exception;
    # without it ``CalledProcessError.stderr`` is always None.
    return subprocess.check_output(
        command,
        shell=True,
        text=True,
        encoding="utf-8",
        stderr=subprocess.PIPE,
    ).strip()
15
+
16
def create_batches(it:Iterable,n:int)->Iterable[Iterable]:
    """Split the items of an iterable into consecutive batches of ``n`` items

    The last batch holds the remaining items and may be shorter.

    Args:
        it (Iterable): iterable from which batches are created
        n (int): number of items for each batch

    Raises:
        ValueError: If iterable is empty or None, or if ``n`` is smaller than 1

    Returns:
        Iterable[Iterable]: Tuple containing the batches, each one a list
    """
    # ``not n`` covers None and 0; the comparison rejects negative sizes,
    # which used to slip through and silently produce no batches.
    if not n or n<1:
        raise ValueError("n must be at least 1")
    if it is None:
        raise ValueError("Iterable cannot be None or empty")
    # Materialise first: emptiness of an arbitrary iterable (a generator, for
    # instance) cannot be decided from its truth value.
    items=list(it)
    if not items:
        raise ValueError("Iterable cannot be None or empty")
    return tuple(items[i:i+n] for i in range(0,len(items),n))