achem 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- achem-1.0.0/LICENSE +21 -0
- achem-1.0.0/PKG-INFO +253 -0
- achem-1.0.0/README.md +208 -0
- achem-1.0.0/pyproject.toml +75 -0
- achem-1.0.0/setup.cfg +4 -0
- achem-1.0.0/src/achem/__init__.py +9 -0
- achem-1.0.0/src/achem/__main__.py +6 -0
- achem-1.0.0/src/achem/cache_manager.py +98 -0
- achem-1.0.0/src/achem/commands.py +117 -0
- achem-1.0.0/src/achem/config_manager.py +91 -0
- achem-1.0.0/src/achem/duckduckgo_client.py +179 -0
- achem-1.0.0/src/achem/export_manager.py +131 -0
- achem-1.0.0/src/achem/huggingface_summarizer.py +270 -0
- achem-1.0.0/src/achem/main.py +294 -0
- achem-1.0.0/src/achem/output_formatter.py +659 -0
- achem-1.0.0/src/achem/search_router.py +61 -0
- achem-1.0.0/src/achem/spell_checker.py +132 -0
- achem-1.0.0/src/achem/sqlite_cache.py +194 -0
- achem-1.0.0/src/achem/text_analyzer.py +337 -0
- achem-1.0.0/src/achem/user_input.py +23 -0
- achem-1.0.0/src/achem/web_scraper.py +172 -0
- achem-1.0.0/src/achem/wikipedia_client.py +123 -0
- achem-1.0.0/src/achem.egg-info/PKG-INFO +253 -0
- achem-1.0.0/src/achem.egg-info/SOURCES.txt +26 -0
- achem-1.0.0/src/achem.egg-info/dependency_links.txt +1 -0
- achem-1.0.0/src/achem.egg-info/entry_points.txt +2 -0
- achem-1.0.0/src/achem.egg-info/requires.txt +17 -0
- achem-1.0.0/src/achem.egg-info/top_level.txt +1 -0
achem-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ACHEM Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
achem-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: achem
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Deep Web Research Tool - Aggregates 30+ sources, scrapes content, generates AI summaries
|
|
5
|
+
Author: ACHEM Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/achem/achem
|
|
8
|
+
Project-URL: Documentation, https://github.com/achem/achem#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/achem/achem
|
|
10
|
+
Project-URL: Issues, https://github.com/achem/achem/issues
|
|
11
|
+
Keywords: research,deep-web,web-scraping,summarization,ai,cli,tool
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Education
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Classifier: Topic :: Text Processing :: General
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: wikipedia-api>=0.5.4
|
|
30
|
+
Requires-Dist: rich>=13.0.0
|
|
31
|
+
Requires-Dist: psutil>=5.9.0
|
|
32
|
+
Requires-Dist: prompt_toolkit>=3.0.0
|
|
33
|
+
Requires-Dist: pyfiglet>=0.8.0
|
|
34
|
+
Requires-Dist: openai>=1.0.0
|
|
35
|
+
Requires-Dist: ddgs>=3.0.0
|
|
36
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
37
|
+
Requires-Dist: requests>=2.31.0
|
|
38
|
+
Provides-Extra: arabic
|
|
39
|
+
Requires-Dist: arabic-reshaper>=3.0.0; extra == "arabic"
|
|
40
|
+
Requires-Dist: python-bidi>=0.14.0; extra == "arabic"
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# ACHEM - Deep Web Research Tool
|
|
47
|
+
|
|
48
|
+

|
|
49
|
+
|
|
50
|
+
> **ACHEM** (Arabic: آشم) is a powerful deep web research tool that aggregates information from 30+ sources, scrapes full content from top results, and generates concise summaries using AI.
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **Deep Web Research**: Gathers results from 30+ sources via DuckDuckGo
|
|
55
|
+
- **Web Scraping**: Extracts full content from top 3 most relevant links
|
|
56
|
+
- **Two-Pass Search**: Prioritizes technical content (StackOverflow, GitHub, forums)
|
|
57
|
+
- **AI Summarization**: Uses Hugging Face Inference Providers (free tier)
|
|
58
|
+
- **Syntax Highlighting**: Color-coded output for easy scanning
|
|
59
|
+
- **SQLite Cache**: Instant recall for repeated searches
|
|
60
|
+
- **Export**: Save summaries to Markdown files
|
|
61
|
+
- **Multi-language**: Supports English, French, and Arabic
|
|
62
|
+
|
|
63
|
+
## Screenshots
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
╔══════════════════════════════════════════════════════════════════╗
|
|
67
|
+
║ ACHEM - Deep Web Research ║
|
|
68
|
+
╚══════════════════════════════════════════════════════════════════╝
|
|
69
|
+
|
|
70
|
+
🔍 Deep Research: how to learn python
|
|
71
|
+
==================================================
|
|
72
|
+
PASS 1: Gathering 30 sources from DuckDuckGo...
|
|
73
|
+
✓ Found 30 sources
|
|
74
|
+
PASS 2: Scraped full content from top 3 links
|
|
75
|
+
→ Analyzing 35 total sources...
|
|
76
|
+
→ Generating deep summary...
|
|
77
|
+
|
|
78
|
+
╭──────────────────────────────────────────────────────────────────╮
|
|
79
|
+
│ UNIFIED RESEARCH SUMMARY │
|
|
80
|
+
├──────────────────────────────────────────────────────────────────┤
|
|
81
|
+
│ 1. Start with the official Python tutorial: │
|
|
82
|
+
│ - Visit docs.python.org/3/tutorial │
|
|
83
|
+
│ │
|
|
84
|
+
│ 2. Use free online tutorials: │
|
|
85
|
+
│ - LearnPython.org, pythonbasics.org │
|
|
86
|
+
╰──────────────────────────────────────────────────────────────────╯
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
### Prerequisites
|
|
92
|
+
|
|
93
|
+
- Python 3.10 or higher
|
|
94
|
+
- pip package manager
|
|
95
|
+
|
|
96
|
+
### Quick Install (PyPI)
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install achem
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Or Install from Source
|
|
103
|
+
|
|
104
|
+
1. **Clone the repository**
|
|
105
|
+
```bash
|
|
106
|
+
git clone https://github.com/achem/achem.git
|
|
107
|
+
cd achem
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
2. **Install in editable mode**
|
|
111
|
+
```bash
|
|
112
|
+
pip install -e .
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
3. **Configure API keys**
|
|
116
|
+
```bash
|
|
117
|
+
cp .env.example .env
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Then edit `.env` and add your Hugging Face API token:
|
|
121
|
+
```env
|
|
122
|
+
HF_API_KEY=hf_your_token_here
|
|
123
|
+
HF_MODEL=Qwen/Qwen2.5-7B-Instruct
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Getting a Hugging Face API Token
|
|
127
|
+
|
|
128
|
+
1. Go to [Hugging Face](https://huggingface.co/)
|
|
129
|
+
2. Create an account (free)
|
|
130
|
+
3. Go to Settings → Access Tokens
|
|
131
|
+
4. Create a new token with "Read" permissions
|
|
132
|
+
5. Copy the token to your `.env` file
|
|
133
|
+
|
|
134
|
+
## Usage
|
|
135
|
+
|
|
136
|
+
### Interactive Mode
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
python src/main.py
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Command Line Mode
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
python src/main.py "your search query"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Options
|
|
149
|
+
|
|
150
|
+
| Option | Description | Default |
|
|
151
|
+
|--------|-------------|---------|
|
|
152
|
+
| `-l, --limit` | Wikipedia results per query | 10 |
|
|
153
|
+
| `--lang` | Language (en/fr/ar/auto) | auto |
|
|
154
|
+
| `--ddg-limit` | DuckDuckGo results | 30 |
|
|
155
|
+
| `--min-relevance` | Minimum relevance % | 0 |
|
|
156
|
+
| `--no-cache` | Skip cache | False |
|
|
157
|
+
| `--no-wikipedia` | Skip Wikipedia | False |
|
|
158
|
+
| `--clear-cache` | Clear SQLite cache | False |
|
|
159
|
+
|
|
160
|
+
### Commands (Interactive Mode)
|
|
161
|
+
|
|
162
|
+
| Command | Description |
|
|
163
|
+
|---------|-------------|
|
|
164
|
+
| `clear` / `cls` | Clear screen |
|
|
165
|
+
| `export` / `save` | Export last summary |
|
|
166
|
+
| `help` / `?` | Show help |
|
|
167
|
+
| `version` / `v` | Show version |
|
|
168
|
+
| `exit` / `quit` / `q` | Exit program |
|
|
169
|
+
|
|
170
|
+
## Project Structure
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
ACHEM/
|
|
174
|
+
├── src/
|
|
175
|
+
│ └── achem/ # Main package
|
|
176
|
+
│ ├── __init__.py
|
|
177
|
+
│ ├── main.py # Entry point
|
|
178
|
+
│ ├── commands.py # Command handler
|
|
179
|
+
│ ├── config_manager.py # Config loader
|
|
180
|
+
│ ├── duckduckgo_client.py # DDG search
|
|
181
|
+
│ ├── export_manager.py # Export to Documents/ACHEM/
|
|
182
|
+
│ ├── huggingface_summarizer.py # AI summarization
|
|
183
|
+
│ ├── output_formatter.py # Terminal UI
|
|
184
|
+
│ ├── search_router.py # Source priority
|
|
185
|
+
│ ├── sqlite_cache.py # SQLite cache
|
|
186
|
+
│ ├── spell_checker.py # Typo correction
|
|
187
|
+
│ ├── text_analyzer.py # TF-IDF analysis
|
|
188
|
+
│ ├── user_input.py # Input handler
|
|
189
|
+
│ ├── web_scraper.py # BeautifulSoup scraper
|
|
190
|
+
│ └── wikipedia_client.py # Wikipedia API
|
|
191
|
+
├── .env.example # Config template
|
|
192
|
+
├── .gitignore
|
|
193
|
+
├── LICENSE
|
|
194
|
+
├── README.md
|
|
195
|
+
└── pyproject.toml # Package metadata
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## How It Works
|
|
199
|
+
|
|
200
|
+
### Two-Pass Search System
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
┌─────────────────────────────────────────────────────┐
|
|
204
|
+
│ PASS 1: DuckDuckGo Search (30 results) │
|
|
205
|
+
│ • Prioritizes technical sites │
|
|
206
|
+
│ • Filters out cookie/login/consent pages │
|
|
207
|
+
│ • Ranks by domain authority │
|
|
208
|
+
├─────────────────────────────────────────────────────┤
|
|
209
|
+
│ PASS 2: Web Scraping (Top 3) │
|
|
210
|
+
│ • BeautifulSoup extracts full article text │
|
|
211
|
+
│ • Removes navigation/footer/scripts │
|
|
212
|
+
│ • Combines up to 10,000 chars per article │
|
|
213
|
+
├─────────────────────────────────────────────────────┤
|
|
214
|
+
│ PASS 3: AI Summarization │
|
|
215
|
+
│ • Neutral technical prompt │
|
|
216
|
+
│ • No ethical warnings or opinions │
|
|
217
|
+
│ • 500-4000 character output │
|
|
218
|
+
│ • Syntax highlighting for steps/commands │
|
|
219
|
+
└─────────────────────────────────────────────────────┘
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Source Priority
|
|
223
|
+
|
|
224
|
+
1. **DuckDuckGo** (Primary) - Real-time web results
|
|
225
|
+
2. **Wikipedia** (Secondary) - Background concepts only
|
|
226
|
+
3. **Web Scraping** - Full content from top 3
|
|
227
|
+
|
|
228
|
+
## Export Location
|
|
229
|
+
|
|
230
|
+
Summaries are saved to:
|
|
231
|
+
- **Linux/macOS**: `~/Documents/ACHEM/`
|
|
232
|
+
- **Windows**: `C:\Users\<username>\Documents\ACHEM\`
|
|
233
|
+
|
|
234
|
+
## Disclaimer
|
|
235
|
+
|
|
236
|
+
**ACHEM is for educational and research purposes only.**
|
|
237
|
+
|
|
238
|
+
The tool aggregates publicly available information from the web. Any actions taken based on the information provided are the sole responsibility of the user. The developer is not responsible for any misuse of this tool.
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
243
|
+
|
|
244
|
+
## Contributing
|
|
245
|
+
|
|
246
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
247
|
+
|
|
248
|
+
## Acknowledgments
|
|
249
|
+
|
|
250
|
+
- [Hugging Face](https://huggingface.co/) - Free inference API
|
|
251
|
+
- [DuckDuckGo](https://duckduckgo.com/) - Privacy-focused search
|
|
252
|
+
- [Wikipedia](https://www.wikipedia.org/) - Free encyclopedia
|
|
253
|
+
- [Qwen](https://huggingface.co/Qwen) - Open source AI models
|
achem-1.0.0/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# ACHEM - Deep Web Research Tool
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
> **ACHEM** (Arabic: آشم) is a powerful deep web research tool that aggregates information from 30+ sources, scrapes full content from top results, and generates concise summaries using AI.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Deep Web Research**: Gathers results from 30+ sources via DuckDuckGo
|
|
10
|
+
- **Web Scraping**: Extracts full content from top 3 most relevant links
|
|
11
|
+
- **Two-Pass Search**: Prioritizes technical content (StackOverflow, GitHub, forums)
|
|
12
|
+
- **AI Summarization**: Uses Hugging Face Inference Providers (free tier)
|
|
13
|
+
- **Syntax Highlighting**: Color-coded output for easy scanning
|
|
14
|
+
- **SQLite Cache**: Instant recall for repeated searches
|
|
15
|
+
- **Export**: Save summaries to Markdown files
|
|
16
|
+
- **Multi-language**: Supports English, French, and Arabic
|
|
17
|
+
|
|
18
|
+
## Screenshots
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
╔══════════════════════════════════════════════════════════════════╗
|
|
22
|
+
║ ACHEM - Deep Web Research ║
|
|
23
|
+
╚══════════════════════════════════════════════════════════════════╝
|
|
24
|
+
|
|
25
|
+
🔍 Deep Research: how to learn python
|
|
26
|
+
==================================================
|
|
27
|
+
PASS 1: Gathering 30 sources from DuckDuckGo...
|
|
28
|
+
✓ Found 30 sources
|
|
29
|
+
PASS 2: Scraped full content from top 3 links
|
|
30
|
+
→ Analyzing 35 total sources...
|
|
31
|
+
→ Generating deep summary...
|
|
32
|
+
|
|
33
|
+
╭──────────────────────────────────────────────────────────────────╮
|
|
34
|
+
│ UNIFIED RESEARCH SUMMARY │
|
|
35
|
+
├──────────────────────────────────────────────────────────────────┤
|
|
36
|
+
│ 1. Start with the official Python tutorial: │
|
|
37
|
+
│ - Visit docs.python.org/3/tutorial │
|
|
38
|
+
│ │
|
|
39
|
+
│ 2. Use free online tutorials: │
|
|
40
|
+
│ - LearnPython.org, pythonbasics.org │
|
|
41
|
+
╰──────────────────────────────────────────────────────────────────╯
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
### Prerequisites
|
|
47
|
+
|
|
48
|
+
- Python 3.10 or higher
|
|
49
|
+
- pip package manager
|
|
50
|
+
|
|
51
|
+
### Quick Install (PyPI)
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install achem
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Or Install from Source
|
|
58
|
+
|
|
59
|
+
1. **Clone the repository**
|
|
60
|
+
```bash
|
|
61
|
+
git clone https://github.com/achem/achem.git
|
|
62
|
+
cd achem
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
2. **Install in editable mode**
|
|
66
|
+
```bash
|
|
67
|
+
pip install -e .
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
3. **Configure API keys**
|
|
71
|
+
```bash
|
|
72
|
+
cp .env.example .env
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Then edit `.env` and add your Hugging Face API token:
|
|
76
|
+
```env
|
|
77
|
+
HF_API_KEY=hf_your_token_here
|
|
78
|
+
HF_MODEL=Qwen/Qwen2.5-7B-Instruct
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Getting a Hugging Face API Token
|
|
82
|
+
|
|
83
|
+
1. Go to [Hugging Face](https://huggingface.co/)
|
|
84
|
+
2. Create an account (free)
|
|
85
|
+
3. Go to Settings → Access Tokens
|
|
86
|
+
4. Create a new token with "Read" permissions
|
|
87
|
+
5. Copy the token to your `.env` file
|
|
88
|
+
|
|
89
|
+
## Usage
|
|
90
|
+
|
|
91
|
+
### Interactive Mode
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
python src/main.py
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Command Line Mode
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
python src/main.py "your search query"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Options
|
|
104
|
+
|
|
105
|
+
| Option | Description | Default |
|
|
106
|
+
|--------|-------------|---------|
|
|
107
|
+
| `-l, --limit` | Wikipedia results per query | 10 |
|
|
108
|
+
| `--lang` | Language (en/fr/ar/auto) | auto |
|
|
109
|
+
| `--ddg-limit` | DuckDuckGo results | 30 |
|
|
110
|
+
| `--min-relevance` | Minimum relevance % | 0 |
|
|
111
|
+
| `--no-cache` | Skip cache | False |
|
|
112
|
+
| `--no-wikipedia` | Skip Wikipedia | False |
|
|
113
|
+
| `--clear-cache` | Clear SQLite cache | False |
|
|
114
|
+
|
|
115
|
+
### Commands (Interactive Mode)
|
|
116
|
+
|
|
117
|
+
| Command | Description |
|
|
118
|
+
|---------|-------------|
|
|
119
|
+
| `clear` / `cls` | Clear screen |
|
|
120
|
+
| `export` / `save` | Export last summary |
|
|
121
|
+
| `help` / `?` | Show help |
|
|
122
|
+
| `version` / `v` | Show version |
|
|
123
|
+
| `exit` / `quit` / `q` | Exit program |
|
|
124
|
+
|
|
125
|
+
## Project Structure
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
ACHEM/
|
|
129
|
+
├── src/
|
|
130
|
+
│ └── achem/ # Main package
|
|
131
|
+
│ ├── __init__.py
|
|
132
|
+
│ ├── main.py # Entry point
|
|
133
|
+
│ ├── commands.py # Command handler
|
|
134
|
+
│ ├── config_manager.py # Config loader
|
|
135
|
+
│ ├── duckduckgo_client.py # DDG search
|
|
136
|
+
│ ├── export_manager.py # Export to Documents/ACHEM/
|
|
137
|
+
│ ├── huggingface_summarizer.py # AI summarization
|
|
138
|
+
│ ├── output_formatter.py # Terminal UI
|
|
139
|
+
│ ├── search_router.py # Source priority
|
|
140
|
+
│ ├── sqlite_cache.py # SQLite cache
|
|
141
|
+
│ ├── spell_checker.py # Typo correction
|
|
142
|
+
│ ├── text_analyzer.py # TF-IDF analysis
|
|
143
|
+
│ ├── user_input.py # Input handler
|
|
144
|
+
│ ├── web_scraper.py # BeautifulSoup scraper
|
|
145
|
+
│ └── wikipedia_client.py # Wikipedia API
|
|
146
|
+
├── .env.example # Config template
|
|
147
|
+
├── .gitignore
|
|
148
|
+
├── LICENSE
|
|
149
|
+
├── README.md
|
|
150
|
+
└── pyproject.toml # Package metadata
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## How It Works
|
|
154
|
+
|
|
155
|
+
### Two-Pass Search System
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
┌─────────────────────────────────────────────────────┐
|
|
159
|
+
│ PASS 1: DuckDuckGo Search (30 results) │
|
|
160
|
+
│ • Prioritizes technical sites │
|
|
161
|
+
│ • Filters out cookie/login/consent pages │
|
|
162
|
+
│ • Ranks by domain authority │
|
|
163
|
+
├─────────────────────────────────────────────────────┤
|
|
164
|
+
│ PASS 2: Web Scraping (Top 3) │
|
|
165
|
+
│ • BeautifulSoup extracts full article text │
|
|
166
|
+
│ • Removes navigation/footer/scripts │
|
|
167
|
+
│ • Combines up to 10,000 chars per article │
|
|
168
|
+
├─────────────────────────────────────────────────────┤
|
|
169
|
+
│ PASS 3: AI Summarization │
|
|
170
|
+
│ • Neutral technical prompt │
|
|
171
|
+
│ • No ethical warnings or opinions │
|
|
172
|
+
│ • 500-4000 character output │
|
|
173
|
+
│ • Syntax highlighting for steps/commands │
|
|
174
|
+
└─────────────────────────────────────────────────────┘
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Source Priority
|
|
178
|
+
|
|
179
|
+
1. **DuckDuckGo** (Primary) - Real-time web results
|
|
180
|
+
2. **Wikipedia** (Secondary) - Background concepts only
|
|
181
|
+
3. **Web Scraping** - Full content from top 3
|
|
182
|
+
|
|
183
|
+
## Export Location
|
|
184
|
+
|
|
185
|
+
Summaries are saved to:
|
|
186
|
+
- **Linux/macOS**: `~/Documents/ACHEM/`
|
|
187
|
+
- **Windows**: `C:\Users\<username>\Documents\ACHEM\`
|
|
188
|
+
|
|
189
|
+
## Disclaimer
|
|
190
|
+
|
|
191
|
+
**ACHEM is for educational and research purposes only.**
|
|
192
|
+
|
|
193
|
+
The tool aggregates publicly available information from the web. Any actions taken based on the information provided are the sole responsibility of the user. The developer is not responsible for any misuse of this tool.
|
|
194
|
+
|
|
195
|
+
## License
|
|
196
|
+
|
|
197
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
198
|
+
|
|
199
|
+
## Contributing
|
|
200
|
+
|
|
201
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
202
|
+
|
|
203
|
+
## Acknowledgments
|
|
204
|
+
|
|
205
|
+
- [Hugging Face](https://huggingface.co/) - Free inference API
|
|
206
|
+
- [DuckDuckGo](https://duckduckgo.com/) - Privacy-focused search
|
|
207
|
+
- [Wikipedia](https://www.wikipedia.org/) - Free encyclopedia
|
|
208
|
+
- [Qwen](https://huggingface.co/Qwen) - Open source AI models
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "achem"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Deep Web Research Tool - Aggregates 30+ sources, scrapes content, generates AI summaries"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "ACHEM Contributors"}
|
|
13
|
+
]
|
|
14
|
+
keywords = [
|
|
15
|
+
"research",
|
|
16
|
+
"deep-web",
|
|
17
|
+
"web-scraping",
|
|
18
|
+
"summarization",
|
|
19
|
+
"ai",
|
|
20
|
+
"cli",
|
|
21
|
+
"tool"
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 5 - Production/Stable",
|
|
25
|
+
"Environment :: Console",
|
|
26
|
+
"Intended Audience :: Developers",
|
|
27
|
+
"Intended Audience :: Education",
|
|
28
|
+
"Intended Audience :: Science/Research",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3.10",
|
|
32
|
+
"Programming Language :: Python :: 3.11",
|
|
33
|
+
"Programming Language :: Python :: 3.12",
|
|
34
|
+
"Programming Language :: Python :: 3.13",
|
|
35
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
36
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
37
|
+
"Topic :: Text Processing :: General",
|
|
38
|
+
]
|
|
39
|
+
requires-python = ">=3.10"
|
|
40
|
+
dependencies = [
|
|
41
|
+
"wikipedia-api>=0.5.4",
|
|
42
|
+
"rich>=13.0.0",
|
|
43
|
+
"psutil>=5.9.0",
|
|
44
|
+
"prompt_toolkit>=3.0.0",
|
|
45
|
+
"pyfiglet>=0.8.0",
|
|
46
|
+
"openai>=1.0.0",
|
|
47
|
+
"ddgs>=3.0.0",
|
|
48
|
+
"beautifulsoup4>=4.12.0",
|
|
49
|
+
"requests>=2.31.0",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[project.optional-dependencies]
|
|
53
|
+
arabic = [
|
|
54
|
+
"arabic-reshaper>=3.0.0",
|
|
55
|
+
"python-bidi>=0.14.0",
|
|
56
|
+
]
|
|
57
|
+
dev = [
|
|
58
|
+
"pytest>=7.0.0",
|
|
59
|
+
"ruff>=0.1.0",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[project.scripts]
|
|
63
|
+
achem = "achem.main:main"
|
|
64
|
+
|
|
65
|
+
[project.urls]
|
|
66
|
+
Homepage = "https://github.com/achem/achem"
|
|
67
|
+
Documentation = "https://github.com/achem/achem#readme"
|
|
68
|
+
Repository = "https://github.com/achem/achem"
|
|
69
|
+
Issues = "https://github.com/achem/achem/issues"
|
|
70
|
+
|
|
71
|
+
[tool.setuptools.packages.find]
|
|
72
|
+
where = ["src"]
|
|
73
|
+
|
|
74
|
+
[tool.setuptools.package-data]
|
|
75
|
+
achem = ["py.typed"]
|
achem-1.0.0/src/achem/cache_manager.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
import hashlib
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CacheManager:
|
|
10
|
+
def __init__(self, cache_dir: str = None, ttl_seconds: int = 86400):
|
|
11
|
+
if cache_dir is None:
|
|
12
|
+
cache_dir = os.path.join(
|
|
13
|
+
os.path.expanduser("~"), ".wiki-summarizer", "cache"
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
self.cache_dir = Path(cache_dir)
|
|
17
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
18
|
+
self.ttl_seconds = ttl_seconds
|
|
19
|
+
|
|
20
|
+
def _get_cache_key(self, query: str) -> str:
|
|
21
|
+
"""Generate a unique cache key for a query."""
|
|
22
|
+
query_normalized = query.lower().strip()
|
|
23
|
+
return hashlib.md5(query_normalized.encode()).hexdigest()
|
|
24
|
+
|
|
25
|
+
def _get_cache_path(self, cache_key: str) -> Path:
|
|
26
|
+
"""Get the file path for a cache key."""
|
|
27
|
+
return self.cache_dir / f"{cache_key}.json"
|
|
28
|
+
|
|
29
|
+
def get(self, query: str) -> Optional[dict]:
|
|
30
|
+
"""Retrieve cached data for a query."""
|
|
31
|
+
cache_key = self._get_cache_key(query)
|
|
32
|
+
cache_path = self._get_cache_path(cache_key)
|
|
33
|
+
|
|
34
|
+
if not cache_path.exists():
|
|
35
|
+
return None
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
with open(cache_path, "r", encoding="utf-8") as f:
|
|
39
|
+
cache_data = json.load(f)
|
|
40
|
+
|
|
41
|
+
cached_time = cache_data.get("timestamp", 0)
|
|
42
|
+
if time.time() - cached_time > self.ttl_seconds:
|
|
43
|
+
cache_path.unlink()
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
return cache_data.get("data")
|
|
47
|
+
except (json.JSONDecodeError, IOError):
|
|
48
|
+
return None
|
|
49
|
+
|
|
50
|
+
def set(self, query: str, data: dict) -> None:
|
|
51
|
+
"""Store data in cache for a query."""
|
|
52
|
+
cache_key = self._get_cache_key(query)
|
|
53
|
+
cache_path = self._get_cache_path(cache_key)
|
|
54
|
+
|
|
55
|
+
cache_data = {"query": query, "timestamp": time.time(), "data": data}
|
|
56
|
+
|
|
57
|
+
try:
|
|
58
|
+
with open(cache_path, "w", encoding="utf-8") as f:
|
|
59
|
+
json.dump(cache_data, f, ensure_ascii=False, indent=2)
|
|
60
|
+
except IOError:
|
|
61
|
+
pass
|
|
62
|
+
|
|
63
|
+
def invalidate(self, query: str = None) -> None:
|
|
64
|
+
"""Invalidate cache for a specific query or all queries."""
|
|
65
|
+
if query:
|
|
66
|
+
cache_key = self._get_cache_key(query)
|
|
67
|
+
cache_path = self._get_cache_path(cache_key)
|
|
68
|
+
if cache_path.exists():
|
|
69
|
+
cache_path.unlink()
|
|
70
|
+
else:
|
|
71
|
+
for cache_file in self.cache_dir.glob("*.json"):
|
|
72
|
+
cache_file.unlink()
|
|
73
|
+
|
|
74
|
+
def get_stats(self) -> dict:
|
|
75
|
+
"""Get cache statistics."""
|
|
76
|
+
total_size = 0
|
|
77
|
+
file_count = 0
|
|
78
|
+
expired_count = 0
|
|
79
|
+
current_time = time.time()
|
|
80
|
+
|
|
81
|
+
for cache_file in self.cache_dir.glob("*.json"):
|
|
82
|
+
file_count += 1
|
|
83
|
+
total_size += cache_file.stat().st_size
|
|
84
|
+
|
|
85
|
+
try:
|
|
86
|
+
with open(cache_file, "r", encoding="utf-8") as f:
|
|
87
|
+
cache_data = json.load(f)
|
|
88
|
+
cached_time = cache_data.get("timestamp", 0)
|
|
89
|
+
if current_time - cached_time > self.ttl_seconds:
|
|
90
|
+
expired_count += 1
|
|
91
|
+
except (json.JSONDecodeError, IOError):
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
return {
|
|
95
|
+
"total_files": file_count,
|
|
96
|
+
"total_size_mb": round(total_size / (1024 * 1024), 2),
|
|
97
|
+
"expired_files": expired_count,
|
|
98
|
+
}
|