puralang-engine 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puralang_engine-1.0.0/LICENSE +21 -0
- puralang_engine-1.0.0/PKG-INFO +242 -0
- puralang_engine-1.0.0/README.md +215 -0
- puralang_engine-1.0.0/puralang/__init__.py +1 -0
- puralang_engine-1.0.0/puralang/cli.py +25 -0
- puralang_engine-1.0.0/puralang/core.py +154 -0
- puralang_engine-1.0.0/puralang_engine.egg-info/PKG-INFO +242 -0
- puralang_engine-1.0.0/puralang_engine.egg-info/SOURCES.txt +12 -0
- puralang_engine-1.0.0/puralang_engine.egg-info/dependency_links.txt +1 -0
- puralang_engine-1.0.0/puralang_engine.egg-info/entry_points.txt +2 -0
- puralang_engine-1.0.0/puralang_engine.egg-info/requires.txt +5 -0
- puralang_engine-1.0.0/puralang_engine.egg-info/top_level.txt +1 -0
- puralang_engine-1.0.0/setup.cfg +4 -0
- puralang_engine-1.0.0/setup.py +38 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SAI DARSINI S
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: puralang_engine
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An AI-driven domain-specific language engine for automated data cleaning pipelines.
|
|
5
|
+
Home-page: https://github.com/SaiDarsini/puralang_engine
|
|
6
|
+
Author: Sai Darsini
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: lark
|
|
14
|
+
Requires-Dist: rich
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: typer
|
|
17
|
+
Requires-Dist: google-genai
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: home-page
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
# PuraLang Engine
|
|
29
|
+
|
|
30
|
+
> **An AI-powered Domain-Specific Language for automated data cleaning pipelines.**
|
|
31
|
+
> Describe your data problem in plain English — PuraLang writes and runs the pipeline for you.
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<img src="https://img.shields.io/badge/version-1.0.0-blue?style=for-the-badge" />
|
|
35
|
+
<img src="https://img.shields.io/badge/python-3.9+-green?style=for-the-badge" />
|
|
36
|
+
<img src="https://img.shields.io/badge/license-MIT-orange?style=for-the-badge" />
|
|
37
|
+
<img src="https://img.shields.io/badge/status-active-brightgreen?style=for-the-badge" />
|
|
38
|
+
</p>
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## What is PuraLang?
|
|
43
|
+
|
|
44
|
+
PuraLang is a custom programming language built specifically for **data cleaning**. Instead of writing 30–50 lines of Python/Pandas code every time you need to clean a dataset, you write a clean, human-readable `.pura` script — or better yet, just **describe what you want in English** and let the AI generate the script for you.
|
|
45
|
+
|
|
46
|
+
### The Problem It Solves
|
|
47
|
+
|
|
48
|
+
Every data engineer and ML practitioner spends hours writing repetitive boilerplate code like this:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
import pandas as pd
|
|
52
|
+
df = pd.read_csv("users.csv")
|
|
53
|
+
df = df.drop_duplicates(subset=["user_id"])
|
|
54
|
+
df["age"] = df["age"].fillna(24)
|
|
55
|
+
df["email"] = df["email"].str.strip().str.lower()
|
|
56
|
+
df.to_csv("clean_users.csv", index=False)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
With PuraLang, the same result is achieved in 5 readable lines:
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
LOAD "users.csv"
|
|
63
|
+
|> DROP_DUPLICATES "user_id"
|
|
64
|
+
|> FILL_NULLS "age" VALUE 24
|
|
65
|
+
|> FORMAT_STRINGS "email" TO LOWERCASE
|
|
66
|
+
|> EXPORT_CSV "clean_users.csv"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
And with **AI Mode**, you don't even need to write that:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
puralang ask "clean users.csv โ remove duplicate user IDs, fill missing ages with 24, lowercase all emails"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Features
|
|
78
|
+
|
|
79
|
+
- **Custom DSL Syntax** โ Clean, readable pipeline syntax with `|>` operators
|
|
80
|
+
- **AI Mode** โ Describe your cleaning task in plain English; the engine generates and runs the script automatically
|
|
81
|
+
- **Visual Execution Trace** โ Beautiful terminal output showing row counts before and after every operation
|
|
82
|
+
- **Multiple Operations** — DROP_DUPLICATES, FILL_NULLS, FORMAT_STRINGS, FILTER_ROWS, RENAME_COLUMN, EXPORT_CSV
|
|
83
|
+
- **Zero Boilerplate** โ No Pandas knowledge required to use it
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Installation
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install puralang-engine
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or clone and run locally:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/SaiDarsini/puralang_engine.git
|
|
97
|
+
cd puralang_engine
|
|
98
|
+
pip install -r requirements.txt
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Quick Start
|
|
104
|
+
|
|
105
|
+
### Manual Mode โ Write a `.pura` script
|
|
106
|
+
|
|
107
|
+
Create a file called `pipeline.pura`:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
LOAD "dirty_data.csv"
|
|
111
|
+
|> DROP_DUPLICATES "user_id"
|
|
112
|
+
|> FILL_NULLS "age" VALUE 24
|
|
113
|
+
|> FORMAT_STRINGS "email" TO LOWERCASE
|
|
114
|
+
|> EXPORT_CSV "cleaned_output.csv"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Run it:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
puralang run pipeline.pura
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### AI Mode โ Describe it in English
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
puralang ask "load sales.csv, remove duplicate order IDs, fill missing prices with 0, export to clean_sales.csv"
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
PuraLang will:
|
|
130
|
+
1. Send your description to an AI model
|
|
131
|
+
2. Show you the generated `.pura` script
|
|
132
|
+
3. Execute it automatically
|
|
133
|
+
4. Print the trace report
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Execution Output
|
|
138
|
+
|
|
139
|
+
Every pipeline run produces a visual trace table in your terminal:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโ PuraLang Execution Trace โโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
143
|
+
โ Operation โ Rows Before โ Rows After โ
|
|
144
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโผโโโโโโโโโโโโโค
|
|
145
|
+
โ LOAD SOURCE DATA โ - โ 4 โ
|
|
146
|
+
โ DROP DUPLICATES [user_id] โ 4 โ 3 โ
|
|
147
|
+
โ FILL NULL FIELDS [age] โ 3 โ 3 โ
|
|
148
|
+
โ STRING TRANSFORM [email] โ 3 โ 3 โ
|
|
149
|
+
โ EXPORT COMPILED FILE โ 3 โ 3 โ
|
|
150
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโดโโโโโโโโโโโโโ
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Supported Operations
|
|
156
|
+
|
|
157
|
+
| Operation | Syntax | Description |
|
|
158
|
+
|-----------|--------|-------------|
|
|
159
|
+
| Load CSV | `LOAD "file.csv"` | Load a CSV file into the pipeline |
|
|
160
|
+
| Drop Duplicates | `DROP_DUPLICATES "column"` | Remove duplicate rows based on a column |
|
|
161
|
+
| Fill Nulls | `FILL_NULLS "column" VALUE 0` | Fill missing values with a default |
|
|
162
|
+
| Format Strings | `FORMAT_STRINGS "column" TO LOWERCASE` | Normalize text casing |
|
|
163
|
+
| Filter Rows | `FILTER_ROWS "column" > 18` | Filter rows by a numeric comparison (`>`, `<`, `==`, `!=`) |
|
|
164
|
+
| Rename Column | `RENAME_COLUMN "old_name" TO "new_name"` | Rename a column |
| Export CSV | `EXPORT_CSV "output.csv"` | Save cleaned data to a new file |
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Project Architecture
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
puralang_engine/
|
|
172
|
+
โ
|
|
173
|
+
โโโ puralang/
|
|
174
|
+
โ โโโ __init__.py # Package version
|
|
175
|
+
โ โโโ core.py # Lark grammar, parser, and transformer engine
|
|
176
|
+
โ โโโ cli.py # Typer CLI โ run and ask commands
|
|
177
|
+
โ
|
|
178
|
+
โโโ tests/
|
|
179
|
+
โ โโโ sample.pura # Example PuraLang script
|
|
180
|
+
โ
|
|
181
|
+
โโโ setup.py # PyPI packaging config
|
|
182
|
+
โโโ README.md
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
The engine works in 3 stages:
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
.pura script โ [Lark Parser] โ Abstract Syntax Tree โ [Transformer] โ Pandas execution โ Clean CSV
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
For AI Mode:
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
English prompt โ [LLM API] โ .pura script โ [Engine] โ Clean CSV
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Tech Stack
|
|
200
|
+
|
|
201
|
+
- **Lark** โ Grammar definition and parsing
|
|
202
|
+
- **Pandas** โ Underlying data manipulation engine
|
|
203
|
+
- **Rich** โ Beautiful terminal output and trace tables
|
|
204
|
+
- **Typer** โ CLI command interface
|
|
205
|
+
- **Google Gemini API** โ AI script generation (AI Mode)
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Roadmap
|
|
210
|
+
|
|
211
|
+
- [x] Core DSL parser and transformer
|
|
212
|
+
- [x] CLI with `run` command
|
|
213
|
+
- [x] AI Mode with `ask` command
|
|
214
|
+
- [x] Visual execution trace table
|
|
215
|
+
- [ ] PyPI public release (`pip install puralang-engine`)
|
|
216
|
+
- [ ] Support for JSON and Excel input formats
|
|
217
|
+
- [ ] Web UI for non-technical users
|
|
218
|
+
- [ ] VS Code extension with `.pura` syntax highlighting
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## About the Author
|
|
223
|
+
|
|
224
|
+
**Sai Darsini Sathuluru**
|
|
225
|
+
B.Tech Student | Mohan Babu University | Generative AI Intern @ Prodigy InfoTech
|
|
226
|
+
Founder & Core Architect of PuraLang Engine
|
|
227
|
+
|
|
228
|
+
- GitHub: [@SaiDarsini](https://github.com/SaiDarsini)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
This project is licensed under the **MIT License** โ you are free to use, modify, and distribute it.
|
|
236
|
+
See the [LICENSE](LICENSE) file for details.
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
<p align="center">
|
|
241
|
+
Built with โค๏ธ by Sai Darsini ยท If this helped you, please โญ the repo!
|
|
242
|
+
</p>
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# PuraLang Engine
|
|
2
|
+
|
|
3
|
+
> **An AI-powered Domain-Specific Language for automated data cleaning pipelines.**
|
|
4
|
+
> Describe your data problem in plain English — PuraLang writes and runs the pipeline for you.
|
|
5
|
+
|
|
6
|
+
<p align="center">
|
|
7
|
+
<img src="https://img.shields.io/badge/version-1.0.0-blue?style=for-the-badge" />
|
|
8
|
+
<img src="https://img.shields.io/badge/python-3.9+-green?style=for-the-badge" />
|
|
9
|
+
<img src="https://img.shields.io/badge/license-MIT-orange?style=for-the-badge" />
|
|
10
|
+
<img src="https://img.shields.io/badge/status-active-brightgreen?style=for-the-badge" />
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## What is PuraLang?
|
|
16
|
+
|
|
17
|
+
PuraLang is a custom programming language built specifically for **data cleaning**. Instead of writing 30–50 lines of Python/Pandas code every time you need to clean a dataset, you write a clean, human-readable `.pura` script — or better yet, just **describe what you want in English** and let the AI generate the script for you.
|
|
18
|
+
|
|
19
|
+
### The Problem It Solves
|
|
20
|
+
|
|
21
|
+
Every data engineer and ML practitioner spends hours writing repetitive boilerplate code like this:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import pandas as pd
|
|
25
|
+
df = pd.read_csv("users.csv")
|
|
26
|
+
df = df.drop_duplicates(subset=["user_id"])
|
|
27
|
+
df["age"] = df["age"].fillna(24)
|
|
28
|
+
df["email"] = df["email"].str.strip().str.lower()
|
|
29
|
+
df.to_csv("clean_users.csv", index=False)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
With PuraLang, the same result is achieved in 5 readable lines:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
LOAD "users.csv"
|
|
36
|
+
|> DROP_DUPLICATES "user_id"
|
|
37
|
+
|> FILL_NULLS "age" VALUE 24
|
|
38
|
+
|> FORMAT_STRINGS "email" TO LOWERCASE
|
|
39
|
+
|> EXPORT_CSV "clean_users.csv"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
And with **AI Mode**, you don't even need to write that:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
puralang ask "clean users.csv — remove duplicate user IDs, fill missing ages with 24, lowercase all emails"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
- **Custom DSL Syntax** — Clean, readable pipeline syntax with `|>` operators
|
|
53
|
+
- **AI Mode** — Describe your cleaning task in plain English; the engine generates and runs the script automatically
|
|
54
|
+
- **Visual Execution Trace** — Beautiful terminal output showing row counts before and after every operation
|
|
55
|
+
- **Multiple Operations** — DROP_DUPLICATES, FILL_NULLS, FORMAT_STRINGS, FILTER_ROWS, RENAME_COLUMN, EXPORT_CSV
|
|
56
|
+
- **Zero Boilerplate** — No Pandas knowledge required to use it
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install puralang-engine
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Or clone and run locally:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
git clone https://github.com/SaiDarsini/puralang_engine.git
|
|
70
|
+
cd puralang_engine
|
|
71
|
+
pip install -r requirements.txt
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Quick Start
|
|
77
|
+
|
|
78
|
+
### Manual Mode — Write a `.pura` script
|
|
79
|
+
|
|
80
|
+
Create a file called `pipeline.pura`:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
LOAD "dirty_data.csv"
|
|
84
|
+
|> DROP_DUPLICATES "user_id"
|
|
85
|
+
|> FILL_NULLS "age" VALUE 24
|
|
86
|
+
|> FORMAT_STRINGS "email" TO LOWERCASE
|
|
87
|
+
|> EXPORT_CSV "cleaned_output.csv"
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Run it:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
puralang run pipeline.pura
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### AI Mode — Describe it in English
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
puralang ask "load sales.csv, remove duplicate order IDs, fill missing prices with 0, export to clean_sales.csv"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
PuraLang will:
|
|
103
|
+
1. Send your description to an AI model
|
|
104
|
+
2. Show you the generated `.pura` script
|
|
105
|
+
3. Execute it automatically
|
|
106
|
+
4. Print the trace report
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Execution Output
|
|
111
|
+
|
|
112
|
+
Every pipeline run produces a visual trace table in your terminal:
|
|
113
|
+
|
|
114
|
+
```
┌──────────────── PuraLang Execution Trace ─────────────────┐
│ Operation                      │ Rows Before │ Rows After │
├────────────────────────────────┼─────────────┼────────────┤
│ LOAD SOURCE DATA               │ -           │ 4          │
│ DROP DUPLICATES [user_id]      │ 4           │ 3          │
│ FILL NULL FIELDS [age]         │ 3           │ 3          │
│ STRING TRANSFORM [email]       │ 3           │ 3          │
│ EXPORT COMPILED FILE           │ 3           │ 3          │
└────────────────────────────────┴─────────────┴────────────┘
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Supported Operations
|
|
129
|
+
|
|
130
|
+
| Operation | Syntax | Description |
|
|
131
|
+
|-----------|--------|-------------|
|
|
132
|
+
| Load CSV | `LOAD "file.csv"` | Load a CSV file into the pipeline |
|
|
133
|
+
| Drop Duplicates | `DROP_DUPLICATES "column"` | Remove duplicate rows based on a column |
|
|
134
|
+
| Fill Nulls | `FILL_NULLS "column" VALUE 0` | Fill missing values with a default |
|
|
135
|
+
| Format Strings | `FORMAT_STRINGS "column" TO LOWERCASE` | Normalize text casing |
|
|
136
|
+
| Filter Rows | `FILTER_ROWS "column" > 18` | Filter rows by a numeric comparison (`>`, `<`, `==`, `!=`) |
|
|
137
|
+
| Rename Column | `RENAME_COLUMN "old_name" TO "new_name"` | Rename a column |
| Export CSV | `EXPORT_CSV "output.csv"` | Save cleaned data to a new file |
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Project Architecture
|
|
142
|
+
|
|
143
|
+
```
puralang_engine/
│
├── puralang/
│   ├── __init__.py       # Package version
│   ├── core.py           # Lark grammar, parser, and transformer engine
│   └── cli.py            # Typer CLI — run and ask commands
│
├── tests/
│   └── sample.pura       # Example PuraLang script
│
├── setup.py              # PyPI packaging config
└── README.md
```
|
|
157
|
+
|
|
158
|
+
The engine works in 3 stages:
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
.pura script → [Lark Parser] → Abstract Syntax Tree → [Transformer] → Pandas execution → Clean CSV
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
For AI Mode:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
English prompt → [LLM API] → .pura script → [Engine] → Clean CSV
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Tech Stack
|
|
173
|
+
|
|
174
|
+
- **Lark** — Grammar definition and parsing
|
|
175
|
+
- **Pandas** — Underlying data manipulation engine
|
|
176
|
+
- **Rich** — Beautiful terminal output and trace tables
|
|
177
|
+
- **Typer** — CLI command interface
|
|
178
|
+
- **Google Gemini API** — AI script generation (AI Mode)
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## Roadmap
|
|
183
|
+
|
|
184
|
+
- [x] Core DSL parser and transformer
|
|
185
|
+
- [x] CLI with `run` command
|
|
186
|
+
- [x] AI Mode with `ask` command
|
|
187
|
+
- [x] Visual execution trace table
|
|
188
|
+
- [ ] PyPI public release (`pip install puralang-engine`)
|
|
189
|
+
- [ ] Support for JSON and Excel input formats
|
|
190
|
+
- [ ] Web UI for non-technical users
|
|
191
|
+
- [ ] VS Code extension with `.pura` syntax highlighting
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## About the Author
|
|
196
|
+
|
|
197
|
+
**Sai Darsini Sathuluru**
|
|
198
|
+
B.Tech Student | Mohan Babu University | Generative AI Intern @ Prodigy InfoTech
|
|
199
|
+
Founder & Core Architect of PuraLang Engine
|
|
200
|
+
|
|
201
|
+
- GitHub: [@SaiDarsini](https://github.com/SaiDarsini)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## License
|
|
207
|
+
|
|
208
|
+
This project is licensed under the **MIT License** — you are free to use, modify, and distribute it.
|
|
209
|
+
See the [LICENSE](LICENSE) file for details.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
<p align="center">
|
|
214
|
+
Built with ❤️ by Sai Darsini · If this helped you, please ⭐ the repo!
|
|
215
|
+
</p>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.0"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from puralang.core import run_pure_script, ask_ai_to_clean
|
|
3
|
+
|
|
4
|
+
app = typer.Typer(help="PuraLang CLI Framework")
|
|
5
|
+
|
|
6
|
+
@app.command()
def run(script_path: str = typer.Argument(..., help="Path targeting your local .pura script file.")):
    """Runs a local manual PuraLang script file."""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Reads the script at ``script_path`` as UTF-8 and passes its contents to
    # ``run_pure_script``.  On any failure the error is reported in red on
    # stderr and the process exits with status 1.
    try:
        with open(script_path, "r", encoding="utf-8") as f:
            content = f.read()
        run_pure_script(content)
    except Exception as e:
        # Top-level CLI boundary: show a readable message instead of a
        # traceback, but still signal failure to the calling shell / CI job
        # (previously the command printed the error and exited with 0).
        typer.secho(f"Compiler Error: {e}", fg=typer.colors.RED, bold=True, err=True)
        raise typer.Exit(code=1) from e
|
|
15
|
+
|
|
16
|
+
@app.command()
def ask(prompt: str = typer.Argument(..., help="Describe your data cleaning goals in plain English.")):
    """AI Mode: Describe your data cleaning goals in plain English."""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Forwards ``prompt`` to ``ask_ai_to_clean``.  On any failure the error is
    # reported in red on stderr and the process exits with status 1.
    try:
        ask_ai_to_clean(prompt)
    except Exception as e:
        # Top-level CLI boundary: keep the friendly one-line message, but exit
        # non-zero so a failed AI run is not mistaken for success
        # (previously the command printed the error and exited with 0).
        typer.secho(f"AI Pipeline Execution Failure: {e}", fg=typer.colors.RED, bold=True, err=True)
        raise typer.Exit(code=1) from e
|
|
23
|
+
|
|
24
|
+
if __name__ == "__main__":
|
|
25
|
+
app()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from lark import Lark, Transformer
|
|
5
|
+
from google import genai
|
|
6
|
+
from google.genai import types
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
puralang_grammar = """
|
|
11
|
+
start: pipeline
|
|
12
|
+
pipeline: load_stmt (pipe step_stmt)*
|
|
13
|
+
|
|
14
|
+
load_stmt: "LOAD" ESCAPED_STRING
|
|
15
|
+
step_stmt: drop_dup | fill_null | format_str | filter_rows | rename_col | export_csv
|
|
16
|
+
|
|
17
|
+
drop_dup: "DROP_DUPLICATES" ESCAPED_STRING
|
|
18
|
+
fill_null: "FILL_NULLS" ESCAPED_STRING "VALUE" (ESCAPED_STRING | NUMBER)
|
|
19
|
+
format_str: "FORMAT_STRINGS" ESCAPED_STRING "TO" CASE_ACTION
|
|
20
|
+
filter_rows: "FILTER_ROWS" ESCAPED_STRING OPERATOR NUMBER
|
|
21
|
+
rename_col: "RENAME_COLUMN" ESCAPED_STRING "TO" ESCAPED_STRING
|
|
22
|
+
export_csv: "EXPORT_CSV" ESCAPED_STRING
|
|
23
|
+
|
|
24
|
+
pipe: "|>"
|
|
25
|
+
CASE_ACTION: "LOWERCASE" | "UPPERCASE"
|
|
26
|
+
OPERATOR: ">" | "<" | "==" | "!="
|
|
27
|
+
|
|
28
|
+
%import common.ESCAPED_STRING
|
|
29
|
+
%import common.NUMBER
|
|
30
|
+
%import common.WS
|
|
31
|
+
%ignore WS
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
class PuraTransformer(Transformer):
    """Execute a parsed PuraLang pipeline against a pandas DataFrame.

    Each method below is named after a rule of ``puralang_grammar`` and is
    invoked by Lark with that rule's child tokens.  Every step updates
    ``self.df`` and appends a "rows before / rows after" entry to a Rich
    table, which ``pipeline`` prints once the whole script has been handled.
    """

    # FILTER_ROWS comparison operators, keyed by their spelling in the DSL.
    _COMPARATORS = {
        ">": lambda series, value: series > value,
        "<": lambda series, value: series < value,
        "==": lambda series, value: series == value,
        "!=": lambda series, value: series != value,
    }

    def __init__(self):
        # Let the Lark base class initialise its own state as well.
        super().__init__()
        self.df = None  # current DataFrame; set by load_stmt
        self.console = Console()
        self.report = Table(title="PuraLang Execution Trace", show_header=True, header_style="bold cyan")
        self.report.add_column("Pipeline Step", style="magenta")
        self.report.add_column("Rows Before", style="yellow")
        self.report.add_column("Rows After", style="green")

    @staticmethod
    def _unquote(token):
        """Return the text of an ESCAPED_STRING token without its delimiters.

        Only the single opening and closing quote are removed, so a value
        that itself ends in an (escaped) quote is not truncated the way
        ``str.strip('"')`` would truncate it.
        """
        return str(token)[1:-1]

    def _require_column(self, token):
        """Unquote a column-name token and check that the column exists.

        Raises:
            KeyError: if the loaded DataFrame has no such column.
        """
        col = self._unquote(token)
        if col not in self.df.columns:
            available = ", ".join(str(name) for name in self.df.columns)
            raise KeyError(f"Column '{col}' not found. Available columns: {available}")
        return col

    def load_stmt(self, items):
        """LOAD "<file>": read the CSV file into ``self.df`` and return it.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        filename = self._unquote(items[0])
        if not os.path.exists(filename):
            raise FileNotFoundError(f"Missing input dataset file: '{filename}'")
        self.df = pd.read_csv(filename)
        self.report.add_row("LOAD SOURCE DATA", "-", str(len(self.df)))
        return self.df

    def drop_dup(self, items):
        """DROP_DUPLICATES "<col>": drop rows whose ``col`` value repeats."""
        col = self._require_column(items[0])
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=[col])
        self.report.add_row(f"DROP DUPLICATES [{col}]", str(before), str(len(self.df)))

    def fill_null(self, items):
        """FILL_NULLS "<col>" VALUE <v>: replace missing values in ``col``.

        A quoted value is used as text; a NUMBER is converted with ``float``.
        """
        col = self._require_column(items[0])
        raw_val = items[1]
        if getattr(raw_val, "type", None) == "ESCAPED_STRING":
            val = self._unquote(raw_val)
        else:
            val = float(raw_val)
        before = len(self.df)
        self.df[col] = self.df[col].fillna(val)
        self.report.add_row(f"FILL NULL FIELDS [{col}]", str(before), str(len(self.df)))

    def format_str(self, items):
        """FORMAT_STRINGS "<col>" TO LOWERCASE|UPPERCASE: trim and re-case text."""
        col = self._require_column(items[0])
        case_type = str(items[1]).strip()
        before = len(self.df)
        original = self.df[col]
        text = original.astype(str).str.strip()
        text = text.str.lower() if "LOWERCASE" in case_type else text.str.upper()
        # astype(str) renders missing values as the literal text "nan";
        # mask them back out so nulls stay nulls in the cleaned data.
        self.df[col] = text.mask(original.isna())
        self.report.add_row(f"STRING TRANSFORM [{col} -> {case_type}]", str(before), str(len(self.df)))

    def filter_rows(self, items):
        """FILTER_ROWS "<col>" <op> <number>: keep rows satisfying the comparison.

        Raises:
            ValueError: if ``op`` is not one of ``>``, ``<``, ``==``, ``!=``.
        """
        col = self._require_column(items[0])
        op = str(items[1]).strip()
        val = float(items[2])
        before = len(self.df)

        compare = self._COMPARATORS.get(op)
        if compare is None:
            # The grammar only admits the four operators above; fail loudly
            # rather than silently skipping the filter if that ever changes.
            raise ValueError(f"Unsupported FILTER_ROWS operator: '{op}'")
        self.df = self.df[compare(self.df[col], val)]

        self.report.add_row(f"FILTER ROWS [{col} {op} {val}]", str(before), str(len(self.df)))

    def rename_col(self, items):
        """RENAME_COLUMN "<old>" TO "<new>": rename one column.

        Raises:
            KeyError: if ``old`` is not a column (``DataFrame.rename`` alone
                would silently do nothing, hiding a typo in the script).
        """
        old_col = self._require_column(items[0])
        new_col = self._unquote(items[1])
        before = len(self.df)
        self.df = self.df.rename(columns={old_col: new_col})
        self.report.add_row(f"RENAME COLUMN [{old_col} -> {new_col}]", str(before), str(len(self.df)))

    def export_csv(self, items):
        """EXPORT_CSV "<file>": write the current DataFrame without its index."""
        filename = self._unquote(items[0])
        self.df.to_csv(filename, index=False)
        rows = str(len(self.df))
        self.report.add_row("EXPORT COMPILED FILE", rows, rows)

    def pipeline(self, items):
        """Print the accumulated trace table and return the final DataFrame."""
        self.console.print(self.report)
        return self.df
|
|
104
|
+
|
|
105
|
+
def run_pure_script(script_content: str):
    """Parse a PuraLang script and execute it.

    The text is parsed with a Lark parser built from ``puralang_grammar``
    (entry rule ``start``); the resulting tree is then run through a fresh
    ``PuraTransformer``.  Nothing is returned.
    """
    syntax_tree = Lark(puralang_grammar, start='start').parse(script_content)
    PuraTransformer().transform(syntax_tree)
|
|
110
|
+
|
|
111
|
+
def _strip_code_fences(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    lines = cleaned.splitlines()[1:]  # drop the opening ``` / ```lang line
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def ask_ai_to_clean(user_prompt: str):
    """Translate an English cleaning request into PuraLang and run it.

    The request is sent to the Gemini model together with a system prompt
    listing the DSL syntax; the returned script is echoed to the terminal and
    then executed with ``run_pure_script``.

    Args:
        user_prompt: Plain-English description of the cleaning task.

    Raises:
        SystemExit: with status 1 when ``GEMINI_API_KEY`` is not set.
        RuntimeError: when the model returns no usable script text.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        print("❌ Error: GEMINI_API_KEY environment variable is not set.", file=sys.stderr)
        sys.exit(1)

    client = genai.Client(api_key=api_key)

    system_instruction = (
        "\n"
        "You are the natural language translation engine for PuraLang, a data-cleaning DSL.\n"
        "Your sole job is to translate a user's plain English cleaning request into a valid PuraLang script.\n"
        "\n"
        "VALID KEYWORDS AND SYNTAX:\n"
        'LOAD "filename.csv"\n'
        '|> DROP_DUPLICATES "column_name"\n'
        '|> FILL_NULLS "column_name" VALUE 20\n'
        '|> FORMAT_STRINGS "column_name" TO LOWERCASE\n'
        '|> FILTER_ROWS "column_name" > 50\n'
        '|> RENAME_COLUMN "old_name" TO "new_name"\n'
        '|> EXPORT_CSV "output.csv"\n'
        "\n"
        "CRITICAL INSTRUCTIONS:\n"
        "- Respond ONLY with the raw PuraLang code script.\n"
        "- Do NOT include markdown code blocks like ```text or ```puralang.\n"
        "- Do NOT write any conversational filler text.\n"
    )

    print("🤖 Consulting PuraLang AI Agent...")
    response = client.models.generate_content(
        model='gemini-2.5-flash',
        contents=f"Translate this cleaning request into PuraLang code:\n\n{user_prompt}",
        config=types.GenerateContentConfig(
            system_instruction=system_instruction,
            temperature=0.1,
        ),
    )

    # ``response.text`` can be None when the model returns no text part, and
    # models sometimes wrap the script in a Markdown fence despite the
    # instructions above; normalise both before handing the text to the parser.
    generated_code = _strip_code_fences(response.text or "")
    if not generated_code:
        raise RuntimeError("The AI model returned an empty response; no PuraLang script to run.")

    console = Console()
    console.print("\n[bold green]✨ AI Generated PuraLang Script Layout:[/bold green]")
    # Print the script verbatim: with markup enabled, bracketed text inside the
    # generated code would be interpreted as Rich style tags.
    console.print(generated_code, style="dim", markup=False)
    console.print()

    run_pure_script(generated_code)
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: puralang_engine
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An AI-driven domain-specific language engine for automated data cleaning pipelines.
|
|
5
|
+
Home-page: https://github.com/SaiDarsini/puralang_engine
|
|
6
|
+
Author: Sai Darsini
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: lark
|
|
14
|
+
Requires-Dist: rich
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: typer
|
|
17
|
+
Requires-Dist: google-genai
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: home-page
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
# PuraLang Engine
|
|
29
|
+
|
|
30
|
+
> **An AI-powered Domain-Specific Language for automated data cleaning pipelines.**
|
|
31
|
+
> Describe your data problem in plain English — PuraLang writes and runs the pipeline for you.
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<img src="https://img.shields.io/badge/version-1.0.0-blue?style=for-the-badge" />
|
|
35
|
+
<img src="https://img.shields.io/badge/python-3.9+-green?style=for-the-badge" />
|
|
36
|
+
<img src="https://img.shields.io/badge/license-MIT-orange?style=for-the-badge" />
|
|
37
|
+
<img src="https://img.shields.io/badge/status-active-brightgreen?style=for-the-badge" />
|
|
38
|
+
</p>
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## What is PuraLang?
|
|
43
|
+
|
|
44
|
+
PuraLang is a custom programming language built specifically for **data cleaning**. Instead of writing 30–50 lines of Python/Pandas code every time you need to clean a dataset, you write a clean, human-readable `.pura` script — or better yet, just **describe what you want in English** and let the AI generate the script for you.
|
|
45
|
+
|
|
46
|
+
### The Problem It Solves
|
|
47
|
+
|
|
48
|
+
Every data engineer and ML practitioner spends hours writing repetitive boilerplate code like this:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
import pandas as pd
|
|
52
|
+
df = pd.read_csv("users.csv")
|
|
53
|
+
df = df.drop_duplicates(subset=["user_id"])
|
|
54
|
+
df["age"] = df["age"].fillna(24)
|
|
55
|
+
df["email"] = df["email"].str.strip().str.lower()
|
|
56
|
+
df.to_csv("clean_users.csv", index=False)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
With PuraLang, the same result is achieved in 5 readable lines:
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
LOAD "users.csv"
|
|
63
|
+
|> DROP_DUPLICATES "user_id"
|
|
64
|
+
|> FILL_NULLS "age" VALUE 24
|
|
65
|
+
|> FORMAT_STRINGS "email" TO LOWERCASE
|
|
66
|
+
|> EXPORT_CSV "clean_users.csv"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
And with **AI Mode**, you don't even need to write that:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pura ask "clean users.csv — remove duplicate user IDs, fill missing ages with 24, lowercase all emails"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Features
|
|
78
|
+
|
|
79
|
+
- **Custom DSL Syntax** — Clean, readable pipeline syntax with `|>` operators
|
|
80
|
+
- **AI Mode** — Describe your cleaning task in plain English; the engine generates and runs the script automatically
|
|
81
|
+
- **Visual Execution Trace** — Beautiful terminal output showing row counts before and after every operation
|
|
82
|
+
- **Multiple Operations** — DROP_DUPLICATES, FILL_NULLS, FORMAT_STRINGS, FILTER_ROWS, EXPORT_CSV
|
|
83
|
+
- **Zero Boilerplate** — No Pandas knowledge required to use it
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Installation
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install puralang-engine
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or clone and run locally:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/SaiDarsini/puralang_engine.git
|
|
97
|
+
cd puralang_engine
|
|
98
|
+
pip install -r requirements.txt
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Quick Start
|
|
104
|
+
|
|
105
|
+
### Manual Mode — Write a `.pura` script
|
|
106
|
+
|
|
107
|
+
Create a file called `pipeline.pura`:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
LOAD "dirty_data.csv"
|
|
111
|
+
|> DROP_DUPLICATES "user_id"
|
|
112
|
+
|> FILL_NULLS "age" VALUE 24
|
|
113
|
+
|> FORMAT_STRINGS "email" TO LOWERCASE
|
|
114
|
+
|> EXPORT_CSV "cleaned_output.csv"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Run it:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pura run pipeline.pura
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### AI Mode — Describe it in English
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pura ask "load sales.csv, remove duplicate order IDs, fill missing prices with 0, export to clean_sales.csv"
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
PuraLang will:
|
|
130
|
+
1. Send your description to an AI model
|
|
131
|
+
2. Show you the generated `.pura` script
|
|
132
|
+
3. Execute it automatically
|
|
133
|
+
4. Print the trace report
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Execution Output
|
|
138
|
+
|
|
139
|
+
Every pipeline run produces a visual trace table in your terminal:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโ PuraLang Execution Trace โโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
143
|
+
│ Operation │ Rows Before │ Rows After │
|
|
144
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโผโโโโโโโโโโโโโค
|
|
145
|
+
│ LOAD SOURCE DATA │ - │ 4 │
|
|
146
|
+
│ DROP DUPLICATES [user_id] │ 4 │ 3 │
|
|
147
|
+
│ FILL NULL FIELDS [age] │ 3 │ 3 │
|
|
148
|
+
│ STRING TRANSFORM [email] │ 3 │ 3 │
|
|
149
|
+
│ EXPORT COMPILED FILE │ 3 │ 3 │
|
|
150
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโดโโโโโโโโโโโโโ
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Supported Operations
|
|
156
|
+
|
|
157
|
+
| Operation | Syntax | Description |
|
|
158
|
+
|-----------|--------|-------------|
|
|
159
|
+
| Load CSV | `LOAD "file.csv"` | Load a CSV file into the pipeline |
|
|
160
|
+
| Drop Duplicates | `DROP_DUPLICATES "column"` | Remove duplicate rows based on a column |
|
|
161
|
+
| Fill Nulls | `FILL_NULLS "column" VALUE 0` | Fill missing values with a default |
|
|
162
|
+
| Format Strings | `FORMAT_STRINGS "column" TO LOWERCASE` | Normalize text casing |
|
|
163
|
+
| Filter Rows | `FILTER_ROWS "column" GREATER_THAN 18` | Filter rows by condition |
|
|
164
|
+
| Export CSV | `EXPORT_CSV "output.csv"` | Save cleaned data to a new file |
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Project Architecture
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
puralang_engine/
|
|
172
|
+
│
|
|
173
|
+
├── puralang/
|
|
174
|
+
│   ├── __init__.py   # Package version
|
|
175
|
+
│   ├── core.py       # Lark grammar, parser, and transformer engine
|
|
176
|
+
│   └── cli.py        # Typer CLI — run and ask commands
|
|
177
|
+
│
|
|
178
|
+
├── tests/
|
|
179
|
+
│   └── sample.pura   # Example PuraLang script
|
|
180
|
+
│
|
|
181
|
+
├── setup.py          # PyPI packaging config
|
|
182
|
+
└── README.md
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
The engine works in 3 stages:
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
.pura script → [Lark Parser] → Abstract Syntax Tree → [Transformer] → Pandas execution → Clean CSV
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
For AI Mode:
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
English prompt → [LLM API] → .pura script → [Engine] → Clean CSV
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Tech Stack
|
|
200
|
+
|
|
201
|
+
- **Lark** — Grammar definition and parsing
|
|
202
|
+
- **Pandas** — Underlying data manipulation engine
|
|
203
|
+
- **Rich** — Beautiful terminal output and trace tables
|
|
204
|
+
- **Typer** — CLI command interface
|
|
205
|
+
- **Google Gemini API** — AI script generation (AI Mode)
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Roadmap
|
|
210
|
+
|
|
211
|
+
- [x] Core DSL parser and transformer
|
|
212
|
+
- [x] CLI with `run` command
|
|
213
|
+
- [x] AI Mode with `ask` command
|
|
214
|
+
- [x] Visual execution trace table
|
|
215
|
+
- [x] PyPI public release (`pip install puralang-engine`)
|
|
216
|
+
- [ ] Support for JSON and Excel input formats
|
|
217
|
+
- [ ] Web UI for non-technical users
|
|
218
|
+
- [ ] VS Code extension with `.pura` syntax highlighting
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## About the Author
|
|
223
|
+
|
|
224
|
+
**Sai Darsini Sathuluru**
|
|
225
|
+
B.Tech Student | Mohan Babu University | Generative AI Intern @ Prodigy InfoTech
|
|
226
|
+
Founder & Core Architect of PuraLang Engine
|
|
227
|
+
|
|
228
|
+
- GitHub: [@SaiDarsini](https://github.com/SaiDarsini)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
This project is licensed under the **MIT License** — you are free to use, modify, and distribute it.
|
|
236
|
+
See the [LICENSE](LICENSE) file for details.
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
<p align="center">
|
|
241
|
+
Built with ❤️ by Sai Darsini · If this helped you, please ⭐ the repo!
|
|
242
|
+
</p>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
puralang/__init__.py
|
|
5
|
+
puralang/cli.py
|
|
6
|
+
puralang/core.py
|
|
7
|
+
puralang_engine.egg-info/PKG-INFO
|
|
8
|
+
puralang_engine.egg-info/SOURCES.txt
|
|
9
|
+
puralang_engine.egg-info/dependency_links.txt
|
|
10
|
+
puralang_engine.egg-info/entry_points.txt
|
|
11
|
+
puralang_engine.egg-info/requires.txt
|
|
12
|
+
puralang_engine.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
puralang
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from setuptools import setup, find_packages
|
|
3
|
+
|
|
4
|
+
# Safely read the README file using UTF-8 encoding to avoid Windows charmap errors
|
|
5
|
+
if os.path.exists("README.md"):
|
|
6
|
+
with open("README.md", "r", encoding="utf-8") as fh:
|
|
7
|
+
long_description = fh.read()
|
|
8
|
+
else:
|
|
9
|
+
long_description = ""
|
|
10
|
+
|
|
11
|
+
setup(
|
|
12
|
+
name="puralang_engine",
|
|
13
|
+
version="1.0.0",
|
|
14
|
+
author="Sai Darsini",
|
|
15
|
+
description="An AI-driven domain-specific language engine for automated data cleaning pipelines.",
|
|
16
|
+
long_description=long_description,
|
|
17
|
+
long_description_content_type="text/markdown",
|
|
18
|
+
url="https://github.com/SaiDarsini/puralang_engine",
|
|
19
|
+
packages=find_packages(),
|
|
20
|
+
install_requires=[
|
|
21
|
+
"lark",
|
|
22
|
+
"rich",
|
|
23
|
+
"pandas",
|
|
24
|
+
"typer",
|
|
25
|
+
"google-genai"
|
|
26
|
+
],
|
|
27
|
+
entry_points={
|
|
28
|
+
"console_scripts": [
|
|
29
|
+
"pura=puralang.cli:app",
|
|
30
|
+
],
|
|
31
|
+
},
|
|
32
|
+
classifiers=[
|
|
33
|
+
"Programming Language :: Python :: 3",
|
|
34
|
+
"License :: OSI Approved :: MIT License",
|
|
35
|
+
"Operating System :: OS Independent",
|
|
36
|
+
],
|
|
37
|
+
python_requires=">=3.9",
|
|
38
|
+
)
|