paper2torch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: paper2torch
3
+ Version: 0.1.0
4
+ Summary: Convert research papers to PyTorch code using LLMs
5
+ Home-page: https://github.com/karmagodjs/paper2torch
6
+ Author: Dhruv Kumar
7
+ Author-email: rafftarsingh7982@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: groq
15
+ Requires-Dist: pymupdf
16
+ Requires-Dist: click
17
+ Requires-Dist: rich
18
+ Requires-Dist: python-dotenv
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: description-content-type
23
+ Dynamic: home-page
24
+ Dynamic: requires-dist
25
+ Dynamic: requires-python
26
+ Dynamic: summary
File without changes
@@ -0,0 +1,3 @@
1
+ __version__ = "0.1.0"
2
+ __author__ = "Dhruv Kumar"
3
+ __description__ = "Convert research papers to PyTorch code"
@@ -0,0 +1,104 @@
1
+ import click
2
+ import os
3
+ import json
4
+ from rich.console import Console
5
+ from rich.panel import Panel
6
+ from rich.progress import Progress, SpinnerColumn, TextColumn
7
+
8
+ # naya
9
+ from paper2torch.parser import extract_text_from_pdf, extract_relevant_sections
10
+ from paper2torch.extractor import extract_sections, get_core_content
11
+ from paper2torch.generator import generate_pytorch_code
12
+ from paper2torch.validator import validate_code
13
+
14
+ console = Console()
15
+
16
+
17
@click.command()
@click.argument('pdf_path')
@click.option('--output', '-o', default='./output', help='Output folder path')
@click.option('--title', '-t', default='Unknown Paper', help='Paper title')
def main(pdf_path, output, title):
    """
    paper2torch — Convert research papers to PyTorch code.

    Usage: paper2torch paper.pdf --output ./generated
    """
    # Same character budget the extractor applies to section text; used when
    # no known section heading is detected and raw text is sent instead.
    fallback_chars = 8000

    console.print(Panel.fit(
        "[bold red]paper2torch[/bold red] — Research Paper to PyTorch",
        border_style="red"
    ))

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True
    ) as progress:

        # Step 1: parse the PDF
        task = progress.add_task("Parsing PDF...", total=None)

        try:
            pdf_data = extract_text_from_pdf(pdf_path)
            console.print(f"[green]✓[/green] PDF parsed — {pdf_data['total_pages']} pages found")
        except FileNotFoundError:
            console.print(f"[red]✗ PDF not found: {pdf_path}[/red]")
            return

        # Without any text there is nothing to send to the LLM, so stop here
        # rather than generating code from an empty prompt.
        if not pdf_data['full_text'].strip():
            console.print("[red]✗ No extractable text found in the PDF[/red]")
            return

        # Step 2: extract the paper sections
        progress.update(task, description="Extracting sections...")
        sections = extract_sections(pdf_data['full_text'])
        core_content = get_core_content(sections)
        if not core_content.strip():
            # No recognised heading: fall back to the start of the raw text.
            console.print("[yellow]⚠[/yellow] No known sections detected — using raw PDF text")
            core_content = pdf_data['full_text'][:fallback_chars]
        console.print(f"[green]✓[/green] Sections extracted — {len(core_content)} chars")

        # Step 3: generate the PyTorch code (the generator talks to Groq)
        progress.update(task, description="Generating PyTorch code with Groq...")
        result = generate_pytorch_code(core_content, title)
        console.print("[green]✓[/green] PyTorch code generated")

        # Step 4: validate the generated model code
        progress.update(task, description="Validating code...")
        validation = validate_code(result['model_code'])

        if validation['passed']:
            console.print("[green]✓[/green] Validation passed")
        else:
            console.print(f"[yellow]⚠[/yellow] Validation: {validation['summary']}")

    # Step 5: save the outputs
    os.makedirs(output, exist_ok=True)

    # Save model.py
    model_path = os.path.join(output, 'model.py')
    with open(model_path, 'w', encoding='utf-8') as f:
        f.write(result['model_code'])

    # Save config.py
    config_path = os.path.join(output, 'config.py')
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(result['config_code'])

    # Save the generated README
    readme_path = os.path.join(output, 'README_generated.md')
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(f"# {title}\n\n")
        f.write("## Generated by paper2torch\n\n")
        f.write("### Architecture Info\n\n")
        f.write(f"```json\n{result['architecture_info']}\n```\n\n")
        f.write("### Validation\n\n")
        f.write(f"- Status: {'✓ Passed' if validation['passed'] else '⚠ Issues found'}\n")
        f.write(f"- Summary: {validation['summary']}\n")

    console.print(Panel.fit(
        f"[bold green]Done![/bold green]\n\n"
        f"Files saved to [cyan]{output}[/cyan]\n"
        f"  • model.py\n"
        f"  • config.py\n"
        f"  • README_generated.md",
        border_style="green"
    ))
101
+
102
+
103
+ if __name__ == "__main__":
104
+ main()
@@ -0,0 +1,54 @@
1
+ import re
2
+
3
# A heading candidate must be shorter than this many characters.
_HEADING_MAX_CHARS = 60
# A line with a keyword in the middle only counts as a heading when it is this
# short (in words) and does not end like a sentence.
_HEADING_MAX_WORDS = 5
# Optional section numbering in front of a heading: "3", "3.1", "3.", "iv.".
_HEADING_NUMBER = r"(?:(?:\d+(?:\.\d+)*[.)]?|[ivxlcdm]+[.)])\s+)?"

_SECTION_KEYWORDS = {
    "abstract": r"abstract",
    "introduction": r"introduction",
    "methodology": r"(?:proposed method|methodology|method|approach)",
    "architecture": r"(?:model architecture|network architecture|architecture)",
    "experiments": r"(?:experiment|evaluation|results)",
    "conclusion": r"conclusion",
}

# Per section: (keyword right at the start of the line, keyword starting any word).
# Compiled once so the per-line loop does no pattern construction.
_SECTION_MATCHERS = {
    name: (
        re.compile(r"^" + _HEADING_NUMBER + keyword),
        re.compile(r"\b" + keyword),
    )
    for name, keyword in _SECTION_KEYWORDS.items()
}


def _detect_section(line: str):
    """Return the section name if ``line`` looks like a heading, else None."""
    stripped = line.strip()
    if not stripped or len(stripped) >= _HEADING_MAX_CHARS:
        return None

    text = stripped.lower()
    # Short, title-like lines ("Our Approach") may carry the keyword anywhere.
    compact = (
        len(text.split()) <= _HEADING_MAX_WORDS
        and not text.endswith((".", ",", ";"))
    )
    for name, (leading, anywhere) in _SECTION_MATCHERS.items():
        # Run-in headings ("Abstract—We ...", "1 Introduction") start with the keyword.
        if leading.match(text) or (compact and anywhere.search(text)):
            return name
    return None


def extract_sections(full_text: str) -> dict:
    """Split paper text into coarse sections keyed by section name.

    The text is scanned line by line. A line switches the current section
    when it looks like a heading: it is shorter than 60 characters and either
    starts with a section keyword (optionally preceded by numbering such as
    "3.1" or "IV.") or is a short title-like line containing the keyword.
    Longer wrapped body lines that merely mention a keyword no longer switch
    sections. Every line, including the heading itself, is appended to the
    section that is current at that point; lines before the first heading are
    dropped.

    Args:
        full_text: Plain text of the paper, lines separated by "\\n".

    Returns:
        A dict with the keys "abstract", "introduction", "methodology",
        "architecture", "experiments" and "conclusion". Each value is the
        collected text (every line followed by "\\n"), or "" if the section
        was never seen.
    """
    collected = {name: [] for name in _SECTION_KEYWORDS}
    current_section = None

    for line in full_text.split('\n'):
        detected = _detect_section(line)
        if detected is not None:
            current_section = detected
        if current_section:
            collected[current_section].append(line + "\n")

    return {name: "".join(chunks) for name, chunks in collected.items()}
39
+
40
+
41
def get_core_content(sections: dict, max_chars: int = 8000) -> str:
    """Build the text that is fed to the LLM from the most relevant sections.

    Non-empty sections are concatenated in priority order (abstract,
    methodology, architecture, introduction), each preceded by a
    "=== NAME ===" banner, and the result is cut to ``max_chars`` characters.

    Args:
        sections: Mapping of section name to section text; missing or empty
            sections are skipped.
        max_chars: Maximum length of the returned string. Defaults to 8000 to
            keep the prompt within the LLM context budget. Pass ``None`` to
            disable truncation.

    Returns:
        The concatenated (and possibly truncated) content; "" when none of the
        priority sections has text.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be non-negative")

    priority = ("abstract", "methodology", "architecture", "introduction")

    core = "".join(
        f"\n=== {name.upper()} ===\n{sections[name]}"
        for name in priority
        if sections.get(name)
    )

    return core if max_chars is None else core[:max_chars]
@@ -0,0 +1,94 @@
1
+ import os
2
+ from groq import Groq
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
8
+
9
+
10
# Groq-hosted model used for every request in this module.
_GROQ_MODEL = "llama-3.3-70b-versatile"


def _strip_code_fences(text: str) -> str:
    """Remove a leading ``` fence line and, only if present, the closing one."""
    lines = text.strip().split('\n')
    if lines and lines[0].startswith("```"):
        lines = lines[1:]
        # Drop the last line only when it really is a closing fence; a reply
        # without one (for example a truncated reply) keeps its final line.
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
    return '\n'.join(lines)


def _ask_model(prompt: str, max_tokens: int) -> str:
    """Send ``prompt`` as a single user message and return the stripped reply."""
    response = client.chat.completions.create(
        model=_GROQ_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens
    )
    # Defensive: treat a missing reply body as empty text instead of failing
    # on ``None.strip()``.
    return (response.choices[0].message.content or "").strip()


def generate_pytorch_code(core_content: str, paper_title: str = "Unknown") -> dict:
    """Ask the LLM for an architecture summary, model code and a config.

    Three independent requests are sent through the module-level Groq
    ``client``. Markdown code fences around any of the replies are removed.

    Args:
        core_content: Paper text to embed in the prompts. Only the first 2000
            characters are used for the config request.
        paper_title: Title of the paper. Accepted for interface compatibility;
            it is not included in any prompt.

    Returns:
        A dict with the keys "architecture_info" (JSON text as returned by the
        model), "model_code" (Python source) and "config_code" (Python source).
    """
    # Step 1: extract the architecture description
    arch_prompt = f"""
You are an expert ML engineer. Read this research paper content and extract the model architecture.

Paper content:
{core_content}

Return ONLY a JSON with these fields:
{{
"model_name": "name of the model",
"architecture_type": "transformer/cnn/rnn/mlp/other",
"key_components": ["list", "of", "components"],
"hyperparameters": {{"param_name": "value"}},
"input_format": "description of input",
"output_format": "description of output"
}}
Return only JSON, no explanation.
"""
    # The caller embeds this text inside its own ```json fence, so fences
    # returned by the model are removed here as well.
    arch_text = _strip_code_fences(_ask_model(arch_prompt, max_tokens=1000))

    # Step 2: generate the PyTorch implementation
    code_prompt = f"""
You are an expert PyTorch developer. Based on this research paper content, write a complete PyTorch implementation.

Paper content:
{core_content}

Rules:
1. Write a complete nn.Module class
2. Include __init__ and forward methods
3. Add clear comments explaining each component
4. Include a config dataclass at the top
5. Add a simple test at the bottom inside if __name__ == "__main__"
6. Use only standard PyTorch — no external libraries
7. Make it runnable as-is

Return ONLY Python code, no explanation, no markdown backticks.
"""
    code_text = _strip_code_fences(_ask_model(code_prompt, max_tokens=4000))

    # Step 3: generate the hyperparameter config
    config_prompt = f"""
Based on this research paper, extract all hyperparameters and write a Python config dataclass.

Paper content:
{core_content[:2000]}

Return ONLY a Python dataclass with all hyperparameters found. Use @dataclass decorator.
No explanation, no markdown backticks.
"""
    config_text = _strip_code_fences(_ask_model(config_prompt, max_tokens=1000))

    return {
        "architecture_info": arch_text,
        "model_code": code_text,
        "config_code": config_text
    }
@@ -0,0 +1,50 @@
1
+ import fitz
2
+ import os
3
+
4
def extract_text_from_pdf(pdf_path: str) -> dict:
    """Read the text of every page of a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A dict with:
            "full_text": text of all pages, each page followed by "\\n";
            "pages": list of {"page": 1-based page number, "text": page text};
            "total_pages": number of pages read.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not point to an existing file.
    """
    # isfile (not exists) so that a directory path is reported as a missing
    # PDF instead of failing later inside the PDF library.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]

    pages = [
        {"page": number, "text": text}
        for number, text in enumerate(page_texts, start=1)
    ]

    return {
        "full_text": "".join(text + "\n" for text in page_texts),
        "pages": pages,
        "total_pages": len(pages)
    }
28
+
29
+
30
def extract_relevant_sections(full_text: str) -> str:
    """Return the text starting at the first short line that mentions a keyword.

    A line triggers capture when, stripped, it is shorter than 60 characters
    and its lower-cased form contains one of the keywords. That line and every
    line after it are returned joined by "\\n"; if no line triggers, the result
    is "".
    """
    trigger_words = (
        "abstract", "introduction", "method", "methodology",
        "architecture", "model", "approach", "proposed",
        "network", "layer", "attention", "encoder", "decoder",
    )

    all_lines = full_text.split('\n')

    for index, raw_line in enumerate(all_lines):
        if len(raw_line.strip()) >= 60:
            continue
        lowered = raw_line.lower().strip()
        for word in trigger_words:
            if word in lowered:
                # Capture never switches off again, so the tail is the answer.
                return '\n'.join(all_lines[index:])

    return ''
@@ -0,0 +1,76 @@
1
+ import ast
2
+ import torch
3
+ import sys
4
+
5
+
6
def validate_syntax(code: str) -> dict:
    """Check whether ``code`` parses as Python source.

    Args:
        code: Python source text.

    Returns:
        {"valid": True, "error": None} when the source parses, otherwise
        {"valid": False, "error": <message>}.
    """
    try:
        ast.parse(code)
    except SyntaxError as e:
        return {
            "valid": False,
            "error": f"Syntax error at line {e.lineno}: {e.msg}"
        }
    except ValueError as e:
        # Before Python 3.12, source containing null bytes raises ValueError
        # rather than SyntaxError; report it instead of crashing the caller.
        return {
            "valid": False,
            "error": f"Invalid source: {e}"
        }
    return {
        "valid": True,
        "error": None
    }
21
+
22
+
23
def _imports_torch(node) -> bool:
    """Return True if ``node`` is an import of ``torch`` or a ``torch.*`` module."""
    if isinstance(node, ast.Import):
        names = [alias.name for alias in node.names]
    elif isinstance(node, ast.ImportFrom) and node.level == 0 and node.module:
        names = [node.module]
    else:
        return False
    return any(name == "torch" or name.startswith("torch.") for name in names)


def _is_module_base(base) -> bool:
    """Return True for a base-class expression named ``Module`` (e.g. ``nn.Module``)."""
    if isinstance(base, ast.Attribute):
        return base.attr == "Module"
    return isinstance(base, ast.Name) and base.id == "Module"


def validate_pytorch(code: str) -> dict:
    """Check that ``code`` has the basic structure of a PyTorch model.

    The source is inspected through its syntax tree, so ``from torch import
    nn`` counts as a torch import, and keywords that only appear in comments
    or string literals are ignored. If the source cannot be parsed, the check
    falls back to plain substring matching.

    Args:
        code: Python source text.

    Returns:
        {"valid": bool, "issues": list of human-readable issue strings};
        "valid" is True exactly when "issues" is empty.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        tree = None

    if tree is None:
        # Unparseable source: best-effort textual checks.
        has_torch = "import torch" in code
        has_module = "nn.Module" in code
        has_forward = "def forward" in code
        has_init = "def __init__" in code
    else:
        nodes = list(ast.walk(tree))
        has_torch = any(_imports_torch(node) for node in nodes)
        has_module = any(
            isinstance(node, ast.ClassDef)
            and any(_is_module_base(base) for base in node.bases)
            for node in nodes
        )
        defined = {node.name for node in nodes if isinstance(node, ast.FunctionDef)}
        has_forward = "forward" in defined
        has_init = "__init__" in defined

    checks = (
        (has_torch, "torch not imported"),
        (has_module, "No nn.Module class found"),
        (has_forward, "No forward method found"),
        (has_init, "No __init__ method found"),
    )
    issues = [message for ok, message in checks if not ok]

    return {
        "valid": not issues,
        "issues": issues
    }
49
+
50
+
51
def validate_code(code: str) -> dict:
    """Run the syntax check and, if it succeeds, the PyTorch structure check.

    Returns a dict with "passed" (overall verdict), "syntax" (result of
    validate_syntax), "pytorch" (result of validate_pytorch, or None when the
    syntax check failed) and "summary" (human-readable outcome).
    """
    syntax_report = validate_syntax(code)

    if syntax_report["valid"]:
        structure_report = validate_pytorch(code)
        ok = structure_report["valid"]

        if ok:
            message = "All checks passed!"
        else:
            message = f"Issues: {', '.join(structure_report['issues'])}"

        return {
            "passed": ok,
            "syntax": syntax_report,
            "pytorch": structure_report,
            "summary": message
        }

    # Syntax failed: the structure check is skipped entirely.
    return {
        "passed": False,
        "syntax": syntax_report,
        "pytorch": None,
        "summary": f"Syntax error: {syntax_report['error']}"
    }
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: paper2torch
3
+ Version: 0.1.0
4
+ Summary: Convert research papers to PyTorch code using LLMs
5
+ Home-page: https://github.com/karmagodjs/paper2torch
6
+ Author: Dhruv Kumar
7
+ Author-email: rafftarsingh7982@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: groq
15
+ Requires-Dist: pymupdf
16
+ Requires-Dist: click
17
+ Requires-Dist: rich
18
+ Requires-Dist: python-dotenv
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: description-content-type
23
+ Dynamic: home-page
24
+ Dynamic: requires-dist
25
+ Dynamic: requires-python
26
+ Dynamic: summary
@@ -0,0 +1,14 @@
1
+ README.md
2
+ setup.py
3
+ paper2torch/__init__.py
4
+ paper2torch/cli.py
5
+ paper2torch/extractor.py
6
+ paper2torch/generator.py
7
+ paper2torch/parser.py
8
+ paper2torch/validator.py
9
+ paper2torch.egg-info/PKG-INFO
10
+ paper2torch.egg-info/SOURCES.txt
11
+ paper2torch.egg-info/dependency_links.txt
12
+ paper2torch.egg-info/entry_points.txt
13
+ paper2torch.egg-info/requires.txt
14
+ paper2torch.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ paper2torch = paper2torch.cli:main
@@ -0,0 +1,5 @@
1
+ groq
2
+ pymupdf
3
+ click
4
+ rich
5
+ python-dotenv
@@ -0,0 +1 @@
1
+ paper2torch
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,35 @@
1
from setuptools import setup, find_packages

# The README doubles as the long description shown on the package index.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="paper2torch",
    version="0.1.0",
    author="Dhruv Kumar",
    author_email="rafftarsingh7982@gmail.com",
    description="Convert research papers to PyTorch code using LLMs",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/karmagodjs/paper2torch",
    packages=find_packages(),
    install_requires=[
        "groq",
        "pymupdf",
        "click",
        "rich",
        "python-dotenv",
        # paper2torch/validator.py imports torch at module level and the CLI
        # imports the validator, so the console script fails without it.
        "torch",
    ],
    entry_points={
        "console_scripts": [
            "paper2torch=paper2torch.cli:main",
        ],
    },
    python_requires=">=3.8",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)