janito 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- janito/__init__.py +1 -1
- janito/cli/agent/__init__.py +7 -0
- janito/cli/agent/conversation.py +149 -0
- janito/cli/agent/initialization.py +172 -0
- janito/cli/agent/query.py +108 -0
- janito/cli/agent.py +7 -395
- janito/cli/app.py +98 -10
- janito/cli/commands/__init__.py +12 -0
- janito/cli/commands/config.py +242 -0
- janito/cli/commands/history.py +119 -0
- janito/cli/commands/profile.py +72 -0
- janito/cli/commands/validation.py +24 -0
- janito/cli/commands/workspace.py +31 -0
- janito/cli/commands.py +9 -326
- janito/config.py +17 -0
- janito/data/instructions_template.txt +7 -4
- janito/tools/__init__.py +8 -2
- janito/tools/fetch_webpage/__init__.py +22 -33
- janito/tools/fetch_webpage/core.py +182 -155
- janito/tools/search_text.py +225 -239
- janito/tools/think.py +37 -0
- janito/tools/usage_tracker.py +1 -0
- {janito-0.13.0.dist-info → janito-0.14.0.dist-info}/METADATA +104 -8
- {janito-0.13.0.dist-info → janito-0.14.0.dist-info}/RECORD +28 -22
- janito/test_file.py +0 -4
- janito/tools/fetch_webpage/chunking.py +0 -76
- janito/tools/fetch_webpage/extractors.py +0 -276
- janito/tools/fetch_webpage/news.py +0 -137
- janito/tools/fetch_webpage/utils.py +0 -108
- {janito-0.13.0.dist-info → janito-0.14.0.dist-info}/WHEEL +0 -0
- {janito-0.13.0.dist-info → janito-0.14.0.dist-info}/entry_points.txt +0 -0
- {janito-0.13.0.dist-info → janito-0.14.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: janito
|
3
|
-
Version: 0.13.0
|
3
|
+
Version: 0.14.0
|
4
4
|
Summary: Janito CLI tool
|
5
5
|
Project-URL: Homepage, https://github.com/joaompinto/janito
|
6
6
|
Author-email: João Pinto <lamego.pinto@gmail.com>
|
@@ -31,12 +31,13 @@ Janito is a powerful AI-assisted command-line interface (CLI) tool built with Py
|
|
31
31
|
- 🔍 Smart code search and editing
|
32
32
|
- 💻 Interactive terminal interface with rich formatting
|
33
33
|
- 📊 Detailed token usage tracking and cost reporting with cache savings analysis
|
34
|
-
- 🛑 Token and tool usage reporting even when interrupted with Ctrl+C
|
35
34
|
- 🌐 Web page fetching with content extraction capabilities
|
36
35
|
- 🔄 Parameter profiles for optimizing Claude's behavior for different tasks
|
37
36
|
- 📋 Line delta tracking to monitor net changes in files
|
38
|
-
- 💬
|
37
|
+
- 💬 Enhanced conversation history with browsing and management
|
39
38
|
- 🔇 Trust mode for concise output without tool details
|
39
|
+
- 🚫 No-tools mode for pure AI interactions without file system access
|
40
|
+
- 📝 Custom system instructions for specialized assistant behavior
|
40
41
|
|
41
42
|
## 🛠️ System Requirements
|
42
43
|
|
@@ -115,19 +116,35 @@ janito --trust "Optimize the HTML code"
|
|
115
116
|
# Or use the short alias
|
116
117
|
janito -t "Optimize the HTML code"
|
117
118
|
|
119
|
+
# Disable all tools for pure AI interaction
|
120
|
+
janito --no-tools "Explain how HTML works"
|
121
|
+
|
122
|
+
# View your conversation history
|
123
|
+
janito --history
|
124
|
+
|
125
|
+
# View a specific number of recent conversations
|
126
|
+
janito --history 10
|
127
|
+
|
118
128
|
# Continue the most recent conversation
|
119
129
|
janito --continue "Please add one more line"
|
120
130
|
|
121
131
|
# Continue a specific conversation using its message ID
|
122
132
|
# (Janito displays the message ID after each conversation)
|
123
|
-
janito --continue abc123def
|
133
|
+
janito --continue 'abc123def' "Let's refine that code"
|
134
|
+
|
135
|
+
# Alternative way to continue a specific conversation
|
136
|
+
janito --continue-id abc123def "Let's refine that code"
|
137
|
+
|
138
|
+
# Provide custom system instructions
|
139
|
+
janito --system "You are a poetry expert who speaks in rhymes" "Write about coding"
|
140
|
+
# Or use the short alias
|
141
|
+
janito -s "You are a poetry expert who speaks in rhymes" "Write about coding"
|
124
142
|
|
125
143
|
# Show current configuration and available profiles
|
126
144
|
janito --show-config
|
127
145
|
|
128
146
|
# You can press Ctrl+C at any time to interrupt a query
|
129
|
-
#
|
130
|
-
# Even interrupted conversations can be continued with --continue
|
147
|
+
# Interrupted conversations can be continued with --continue
|
131
148
|
```
|
132
149
|
|
133
150
|
## 🔧 Available Tools
|
@@ -213,9 +230,58 @@ This feature is particularly useful for:
|
|
213
230
|
- Focusing on results rather than the process
|
214
231
|
- Creating cleaner output for documentation or sharing
|
215
232
|
|
233
|
+
## 🚫 No-Tools Mode
|
234
|
+
|
235
|
+
Janito provides a no-tools mode that disables all file system and external tools for pure AI interactions:
|
236
|
+
|
237
|
+
### How It Works
|
238
|
+
|
239
|
+
- When enabled with `--no-tools`, Janito disables all tools for the current session
|
240
|
+
- Claude will respond based purely on its knowledge without accessing or modifying files
|
241
|
+
- This mode is a per-session setting and not saved to your configuration
|
242
|
+
|
243
|
+
### Using No-Tools Mode
|
244
|
+
|
245
|
+
```bash
|
246
|
+
# Enable no-tools mode
|
247
|
+
janito --no-tools "Explain how Docker containers work"
|
248
|
+
```
|
249
|
+
|
250
|
+
This feature is particularly useful for:
|
251
|
+
- Getting general information or explanations without file system access
|
252
|
+
- Brainstorming sessions where you don't need file operations
|
253
|
+
- Safer operation in sensitive environments
|
254
|
+
- Faster responses for queries that don't require tools
|
255
|
+
|
256
|
+
## 📝 Custom System Instructions
|
257
|
+
|
258
|
+
Janito allows you to provide custom system instructions to change Claude's behavior:
|
259
|
+
|
260
|
+
### How It Works
|
261
|
+
|
262
|
+
- When provided with `--system` or `-s`, Janito uses your custom instructions instead of the default
|
263
|
+
- This allows you to create specialized assistant personalities or behaviors
|
264
|
+
- Custom instructions are a per-session setting and not saved to your configuration
|
265
|
+
|
266
|
+
### Using Custom System Instructions
|
267
|
+
|
268
|
+
```bash
|
269
|
+
# Provide custom system instructions
|
270
|
+
janito --system "You are a poetry expert who speaks in rhymes" "Write about coding"
|
271
|
+
|
272
|
+
# Or use the short alias
|
273
|
+
janito -s "You are a cybersecurity expert" "Review this authentication code"
|
274
|
+
```
|
275
|
+
|
276
|
+
This feature is particularly useful for:
|
277
|
+
- Creating specialized assistant personalities
|
278
|
+
- Focusing Claude on specific domains or expertise
|
279
|
+
- Setting up specific response formats or styles
|
280
|
+
- Educational scenarios where you need different expert perspectives
|
281
|
+
|
216
282
|
## 💬 Conversation History
|
217
283
|
|
218
|
-
Janito automatically saves your conversation history, allowing you to continue previous discussions:
|
284
|
+
Janito automatically saves your conversation history, allowing you to browse, manage, and continue previous discussions:
|
219
285
|
|
220
286
|
### How It Works
|
221
287
|
|
@@ -223,6 +289,23 @@ Janito automatically saves your conversation history, allowing you to continue p
|
|
223
289
|
- The most recent conversation is also saved as `.janito/last_message.json` for backward compatibility
|
224
290
|
- After each conversation, Janito displays the command to continue that specific conversation
|
225
291
|
|
292
|
+
### Browsing Your History
|
293
|
+
|
294
|
+
You can view your conversation history with the `--history` flag:
|
295
|
+
|
296
|
+
```bash
|
297
|
+
# Show the 20 most recent conversations (default)
|
298
|
+
janito --history
|
299
|
+
|
300
|
+
# Show a specific number of recent conversations
|
301
|
+
janito --history 10
|
302
|
+
```
|
303
|
+
|
304
|
+
This displays a table with:
|
305
|
+
- Conversation ID
|
306
|
+
- Date and time
|
307
|
+
- First query from each conversation
|
308
|
+
|
226
309
|
### Using the Continue Feature
|
227
310
|
|
228
311
|
```bash
|
@@ -231,6 +314,13 @@ janito --continue "Add more details to your previous response"
|
|
231
314
|
|
232
315
|
# Continue a specific conversation using its ID
|
233
316
|
janito --continue abc123def "Let's modify that code you suggested"
|
317
|
+
|
318
|
+
# Just use --continue without arguments to continue the most recent conversation
|
319
|
+
# and be prompted for your next query
|
320
|
+
janito --continue
|
321
|
+
|
322
|
+
# Alternative way to continue a specific conversation
|
323
|
+
janito --continue-id abc123def "Let's modify that code you suggested"
|
234
324
|
```
|
235
325
|
|
236
326
|
The `--continue` flag (or `-c` for short) allows you to:
|
@@ -266,11 +356,17 @@ Janito offers a variety of command-line options to customize its behavior:
|
|
266
356
|
--set-api-key TEXT Set the Anthropic API key globally in the user's home directory
|
267
357
|
--ask Enable ask mode which disables tools that perform changes
|
268
358
|
--trust, -t Enable trust mode which suppresses tool outputs for concise execution
|
359
|
+
--no-tools Disable all tools for this session (pure AI interaction)
|
269
360
|
--temperature FLOAT Set the temperature for model generation (0.0 to 1.0)
|
270
361
|
--profile TEXT Use a predefined parameter profile (precise, balanced, conversational, creative, technical)
|
271
362
|
--role TEXT Set the assistant's role (default: 'software engineer')
|
363
|
+
--system, -s TEXT Provide custom system instructions, bypassing the default file load method
|
272
364
|
--version Show the version and exit
|
273
|
-
--continue, -c TEXT Continue a
|
365
|
+
--continue, -c TEXT Continue a conversation. Can be used as: 1) --continue (to continue most recent),
|
366
|
+
2) --continue 123 (to continue conversation with ID 123), or
|
367
|
+
3) --continue "query" (to continue most recent with new query)
|
368
|
+
--continue-id TEXT Continue a specific conversation with the given ID
|
369
|
+
--history Show a summary of conversations. Use --history for default (20) or --history n to specify count
|
274
370
|
--help Show the help message and exit
|
275
371
|
```
|
276
372
|
|
@@ -1,17 +1,26 @@
|
|
1
|
-
janito/__init__.py,sha256=
|
1
|
+
janito/__init__.py,sha256=7oWOSdYGAlxA75CpzMThFPrXL2oWwJw60hTnRBxIo1A,53
|
2
2
|
janito/__main__.py,sha256=Oy-Nc1tZkpyvTKuq1R8oHSuJTkvptN6H93kIHBu7DKY,107
|
3
3
|
janito/callbacks.py,sha256=E1FPXYHZUgiEGMabYuf999PSf_Su4ByHOWlc1-hMqWE,915
|
4
|
-
janito/config.py,sha256=
|
5
|
-
janito/test_file.py,sha256=c6GWGdTYG3z-Y5XBao9Tmhmq3G-v0L37OfwLgBo8zIU,126
|
4
|
+
janito/config.py,sha256=XZYVjHPTv41l0NyGHaeq3UKB5FyRNHSEuLui-_mv5BU,13821
|
6
5
|
janito/token_report.py,sha256=Mks7o2yTxPChgQyBJNoQ5eMmrhSgEM4LKCKi2tHJbVo,9580
|
7
6
|
janito/cli/__init__.py,sha256=dVi9l3E86YyukjxQ-XSUnMZkghnNasXex-X5XAOBiwk,85
|
8
|
-
janito/cli/agent.py,sha256=
|
9
|
-
janito/cli/app.py,sha256=
|
10
|
-
janito/cli/commands.py,sha256=
|
7
|
+
janito/cli/agent.py,sha256=gIkbQZHNcg8hH52ziqklhKY6EDSyl6URUbtRvQQHqyI,414
|
8
|
+
janito/cli/app.py,sha256=QkH14BhnFQ2RgavLwxMKa9m_7C8KGoSuSjRNS3iYR1o,9477
|
9
|
+
janito/cli/commands.py,sha256=M0DKCFZMS1mXA-s9Lm_4Ug7LJFlNg12fBdcDOhCnIWg,380
|
11
10
|
janito/cli/output.py,sha256=mo3hUokhrD4SWexUjCbLGGQeCDUf0369DA_i9BW7HjU,933
|
12
11
|
janito/cli/utils.py,sha256=gO4NtCNwtEzYDsQesrFlqB5FtYuw87yGwo4iG3nINgw,661
|
13
|
-
janito/
|
14
|
-
janito/
|
12
|
+
janito/cli/agent/__init__.py,sha256=3O3b_MnwX_-qApYlyXzXhdjT4-dHzfxpOAy1lK_ZAIE,250
|
13
|
+
janito/cli/agent/conversation.py,sha256=iif8ciIbsb_HZaZ6qvK40s0PyW3QmriZ1Q7PtUqKfI4,5644
|
14
|
+
janito/cli/agent/initialization.py,sha256=-7APIAk0gawVVrxd1AdeBwffEuudvy1bIhDE6LMXfw4,6583
|
15
|
+
janito/cli/agent/query.py,sha256=Sly1k-cVOydTPraUmdSPKxFYrScmCOd8BfAqFUvWwkg,4612
|
16
|
+
janito/cli/commands/__init__.py,sha256=4zGGB0x-TeBetSArBV38SZQo0T1K59vJofhXhLiD3g0,331
|
17
|
+
janito/cli/commands/config.py,sha256=U1Bnc_YSMwFkX-1dcksmdgsoH2_yrO8MSYTaESxR-sQ,9819
|
18
|
+
janito/cli/commands/history.py,sha256=WzWgPRZ7mQwlbDCPw3Hapb-K_Ybk1uKLs-qLams6sO4,5409
|
19
|
+
janito/cli/commands/profile.py,sha256=jsGKSqXlaSLE__Xjj9OKNTW68evU0APyK9QYr_OEELQ,2397
|
20
|
+
janito/cli/commands/validation.py,sha256=ldtfk0vAOkrto1Ys6G0MDmUOQvYalHsTd_l_0fMVHcI,718
|
21
|
+
janito/cli/commands/workspace.py,sha256=nSCrN3SSK4zq10x38uksaOMuzx___rnl-cmyVpvNlqk,909
|
22
|
+
janito/data/instructions_template.txt,sha256=Ewfjk_1qRO178WEcRQVONnff6CpRAeKGE8JAydD39Qw,2143
|
23
|
+
janito/tools/__init__.py,sha256=mCtmWVFfgrk_SFWcwzw92O9Fb9P-5SYnO2rPMxkT5rM,1624
|
15
24
|
janito/tools/decorators.py,sha256=Tp48n5y4LKsjyV3HeOA9wk2dV413RrEG-23kRyQVlKs,2522
|
16
25
|
janito/tools/delete_file.py,sha256=UrZ5q59SIxWfuJcqgol6yPBqL-RhO9lFCF4MqAc6o00,2252
|
17
26
|
janito/tools/find_files.py,sha256=c_N9ETcRPprQeuZYanwFnl-9E05ZqUYhNVoCRS5uqQg,8300
|
@@ -19,17 +28,14 @@ janito/tools/move_file.py,sha256=FCs1ghalfHlXmcbAA_IlLcUll9hTOU1MMFGrTWopXvM,274
|
|
19
28
|
janito/tools/prompt_user.py,sha256=OnTiWVBCbL_2MYu7oThlKr8X_pnYdG-dzxXSOgJF41c,1942
|
20
29
|
janito/tools/replace_file.py,sha256=i4GoLtS14eKSU5lYI18mJ96S0_ekeHMwlQfazg-fxrM,2296
|
21
30
|
janito/tools/rich_console.py,sha256=0zWYRF8qk4N-upuwswUSEfYFAfkEYDYeCgxst-czWtY,6044
|
22
|
-
janito/tools/search_text.py,sha256=
|
23
|
-
janito/tools/
|
31
|
+
janito/tools/search_text.py,sha256=GXc8fuqKD6r0dEr2lMs5LrYYMM5RtTApvMXlvdt2ZiY,8839
|
32
|
+
janito/tools/think.py,sha256=VsfYzYSfdBz6ayUYE8aoeAsdeN2PFpHjwUkxtaw1YYQ,1204
|
33
|
+
janito/tools/usage_tracker.py,sha256=3qHXIs9rzm0oY2iaaFHqlLre6ZoWicZgOc46s6XyGc0,4681
|
24
34
|
janito/tools/bash/bash.py,sha256=AFm0w_z-yyYRWxuR744OFpm5iCZaZpE-pWbnKbgajp4,3665
|
25
35
|
janito/tools/bash/unix_persistent_bash.py,sha256=I59PPQiXHscPJ6Y7ev_83dLFNFWq1hKwAK9kFXdnbBY,7185
|
26
36
|
janito/tools/bash/win_persistent_bash.py,sha256=96xm_yijjc6hBYfNluLahbvR2oUuHug_JkoMah7Hy38,12894
|
27
|
-
janito/tools/fetch_webpage/__init__.py,sha256=
|
28
|
-
janito/tools/fetch_webpage/
|
29
|
-
janito/tools/fetch_webpage/core.py,sha256=3XDvYnC_UbQUNumwWw32hfJhjJUEaPzzoF2yhxhwxos,7764
|
30
|
-
janito/tools/fetch_webpage/extractors.py,sha256=-jrLDRWfWyF2SNOATdRWrqETqnnDLbe2JUWtuyHKFOU,12043
|
31
|
-
janito/tools/fetch_webpage/news.py,sha256=Hp0uNTnRzTa-4hyegQbHSmLeSbkiSpx4cP2oP_hKLEQ,5378
|
32
|
-
janito/tools/fetch_webpage/utils.py,sha256=mkfwefD7U9HOktIwo1eP63v7dpVY-q06COUjaqnTT5M,3412
|
37
|
+
janito/tools/fetch_webpage/__init__.py,sha256=Rnzni75VX1JfmGJ2LHckGWQqr7xC95wahh5Q-1PlncQ,514
|
38
|
+
janito/tools/fetch_webpage/core.py,sha256=3Q8dAlWYMU_Mg8DyrNCBaLuriAxItSYIyGAcPZFktcI,7555
|
33
39
|
janito/tools/str_replace_editor/__init__.py,sha256=kYmscmQgft3Jzt3oCNz7k2FiRbJvku6OFDDC3Q_zoAA,144
|
34
40
|
janito/tools/str_replace_editor/editor.py,sha256=BckYfiMRUYDfDrbu871qMt2AfZexth_02QhwYYOd53g,2489
|
35
41
|
janito/tools/str_replace_editor/utils.py,sha256=akiPqCHjky_RwL9OitHJJ7uQ-3fNaA8wt_K_YO1EP6I,954
|
@@ -39,9 +45,9 @@ janito/tools/str_replace_editor/handlers/insert.py,sha256=eKHodm2ozKUlRMxWMLAsu9
|
|
39
45
|
janito/tools/str_replace_editor/handlers/str_replace.py,sha256=RciLTlA7R2PGljeeyluLcBHUAje9c1OCxm-bFE7j6iY,4473
|
40
46
|
janito/tools/str_replace_editor/handlers/undo.py,sha256=3OIdAWkpXC2iDe94_sfx_WxEFh3a1cRzoP0NtPXq1Ks,2491
|
41
47
|
janito/tools/str_replace_editor/handlers/view.py,sha256=k8F-n64bomHmDjavr5OJKC4cHhfm4_1-aMIFZdMICQo,6809
|
42
|
-
janito/data/instructions_template.txt,sha256=
|
43
|
-
janito-0.
|
44
|
-
janito-0.
|
45
|
-
janito-0.
|
46
|
-
janito-0.
|
47
|
-
janito-0.
|
48
|
+
janito/data/instructions_template.txt,sha256=Ewfjk_1qRO178WEcRQVONnff6CpRAeKGE8JAydD39Qw,2143
|
49
|
+
janito-0.14.0.dist-info/METADATA,sha256=VzEC8lrwVeAhFxZDGripkjFcCm8CoHAjJFAh_OchFo0,14074
|
50
|
+
janito-0.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
51
|
+
janito-0.14.0.dist-info/entry_points.txt,sha256=JMbF_1jg-xQddidpAYkzjOKdw70fy_ymJfcmerY2wIY,47
|
52
|
+
janito-0.14.0.dist-info/licenses/LICENSE,sha256=6-H8LXExbBIAuT4cyiE-Qy8Bad1K4pagQRVTWr6wkhk,1096
|
53
|
+
janito-0.14.0.dist-info/RECORD,,
|
janito/test_file.py
DELETED
janito/tools/fetch_webpage/chunking.py
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Functions for chunking large content into manageable pieces.
|
3
|
-
"""
|
4
|
-
|
5
|
-
from typing import List
|
6
|
-
from janito.tools.rich_console import print_info, print_success
|
7
|
-
|
8
|
-
|
9
|
-
def chunk_large_content(text: str, chunk_size: int = 4000, overlap: int = 500) -> List[str]:
|
10
|
-
"""
|
11
|
-
Split very large text content into manageable chunks suitable for LLM processing.
|
12
|
-
|
13
|
-
Args:
|
14
|
-
text: The text to chunk
|
15
|
-
chunk_size: Target size for each chunk in characters
|
16
|
-
overlap: Number of characters to overlap between chunks
|
17
|
-
|
18
|
-
Returns:
|
19
|
-
List of text chunks
|
20
|
-
"""
|
21
|
-
if not text or len(text) <= chunk_size:
|
22
|
-
return [text] if text else []
|
23
|
-
|
24
|
-
print_info(f"Chunking {len(text)} characters of text into ~{chunk_size} character chunks", "Content Chunking")
|
25
|
-
|
26
|
-
# Try to split on paragraph breaks first
|
27
|
-
paragraphs = text.split('\n\n')
|
28
|
-
chunks = []
|
29
|
-
current_chunk = ""
|
30
|
-
|
31
|
-
for para in paragraphs:
|
32
|
-
# If adding this paragraph would exceed chunk size
|
33
|
-
if len(current_chunk) + len(para) + 2 > chunk_size:
|
34
|
-
# If current chunk is not empty, add it to chunks
|
35
|
-
if current_chunk:
|
36
|
-
chunks.append(current_chunk)
|
37
|
-
# Start new chunk with overlap from previous chunk
|
38
|
-
if overlap > 0 and len(current_chunk) > overlap:
|
39
|
-
current_chunk = current_chunk[-overlap:] + "\n\n" + para
|
40
|
-
else:
|
41
|
-
current_chunk = para
|
42
|
-
else:
|
43
|
-
# If paragraph itself is bigger than chunk size, split it
|
44
|
-
if len(para) > chunk_size:
|
45
|
-
words = para.split()
|
46
|
-
temp_chunk = ""
|
47
|
-
for word in words:
|
48
|
-
if len(temp_chunk) + len(word) + 1 > chunk_size:
|
49
|
-
chunks.append(temp_chunk)
|
50
|
-
# Start new chunk with overlap
|
51
|
-
if overlap > 0 and len(temp_chunk) > overlap:
|
52
|
-
temp_chunk = temp_chunk[-overlap:] + " " + word
|
53
|
-
else:
|
54
|
-
temp_chunk = word
|
55
|
-
else:
|
56
|
-
if temp_chunk:
|
57
|
-
temp_chunk += " " + word
|
58
|
-
else:
|
59
|
-
temp_chunk = word
|
60
|
-
if temp_chunk:
|
61
|
-
current_chunk = temp_chunk
|
62
|
-
else:
|
63
|
-
chunks.append(para)
|
64
|
-
else:
|
65
|
-
# Add paragraph to current chunk
|
66
|
-
if current_chunk:
|
67
|
-
current_chunk += "\n\n" + para
|
68
|
-
else:
|
69
|
-
current_chunk = para
|
70
|
-
|
71
|
-
# Don't forget the last chunk
|
72
|
-
if current_chunk:
|
73
|
-
chunks.append(current_chunk)
|
74
|
-
|
75
|
-
print_success(f"Text chunked into {len(chunks)} segments", "Content Chunking")
|
76
|
-
return chunks
|
janito/tools/fetch_webpage/extractors.py
DELETED
@@ -1,276 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Content extraction methods for web pages.
|
3
|
-
"""
|
4
|
-
|
5
|
-
from typing import List, Dict, Union, Optional
|
6
|
-
from bs4 import BeautifulSoup
|
7
|
-
import trafilatura
|
8
|
-
from newspaper import Article
|
9
|
-
import re
|
10
|
-
|
11
|
-
from janito.tools.rich_console import print_info, print_success, print_warning
|
12
|
-
from janito.tools.fetch_webpage.utils import clean_text, similar_text
|
13
|
-
|
14
|
-
|
15
|
-
def extract_clean_text(html_content: str, method: str = 'trafilatura',
|
16
|
-
url: Optional[str] = None, target_strings: List[str] = None) -> str:
|
17
|
-
"""
|
18
|
-
Extract clean, relevant text from HTML content using various methods.
|
19
|
-
|
20
|
-
Args:
|
21
|
-
html_content: The HTML content to extract text from
|
22
|
-
method: The extraction method to use ('trafilatura', 'newspaper', 'beautifulsoup', 'all')
|
23
|
-
url: Optional URL for methods that require it (like newspaper)
|
24
|
-
target_strings: Optional list of strings to target specific content sections
|
25
|
-
|
26
|
-
Returns:
|
27
|
-
Extracted text content
|
28
|
-
"""
|
29
|
-
print_info(f"Extracting content using method: {method}", "Content Extraction")
|
30
|
-
|
31
|
-
extracted_text = ""
|
32
|
-
|
33
|
-
if method == 'trafilatura' or method == 'all':
|
34
|
-
try:
|
35
|
-
traf_text = trafilatura.extract(html_content, include_links=False,
|
36
|
-
include_tables=False, include_images=False,
|
37
|
-
favor_precision=True)
|
38
|
-
if traf_text and len(traf_text) > 100:
|
39
|
-
if method == 'trafilatura':
|
40
|
-
print_success("Successfully extracted content with Trafilatura", "Content Extraction")
|
41
|
-
return clean_text(traf_text)
|
42
|
-
extracted_text = traf_text
|
43
|
-
print_success("Successfully extracted content with Trafilatura", "Content Extraction")
|
44
|
-
except Exception as e:
|
45
|
-
print_warning(f"Content Extraction: Trafilatura extraction error: {str(e)}")
|
46
|
-
|
47
|
-
if method == 'newspaper' or method == 'all':
|
48
|
-
if not url:
|
49
|
-
print_warning("Content Extraction: URL required for newspaper extraction but not provided")
|
50
|
-
else:
|
51
|
-
try:
|
52
|
-
article = Article(url)
|
53
|
-
article.download(html_content)
|
54
|
-
article.parse()
|
55
|
-
np_text = article.text
|
56
|
-
if np_text and len(np_text) > 100:
|
57
|
-
if method == 'newspaper':
|
58
|
-
print_success("Successfully extracted content with Newspaper3k", "Content Extraction")
|
59
|
-
return clean_text(np_text)
|
60
|
-
if not extracted_text or len(np_text) > len(extracted_text):
|
61
|
-
extracted_text = np_text
|
62
|
-
print_success("Successfully extracted content with Newspaper3k", "Content Extraction")
|
63
|
-
except Exception as e:
|
64
|
-
print_warning(f"Content Extraction: Newspaper extraction error: {str(e)}")
|
65
|
-
|
66
|
-
if method == 'beautifulsoup' or method == 'all':
|
67
|
-
try:
|
68
|
-
soup = BeautifulSoup(html_content, 'html.parser')
|
69
|
-
|
70
|
-
# Remove script, style, and other non-content elements
|
71
|
-
for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
|
72
|
-
element.decompose()
|
73
|
-
|
74
|
-
# Extract text from paragraph and heading tags
|
75
|
-
paragraphs = []
|
76
|
-
for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article']):
|
77
|
-
text = tag.get_text(strip=True)
|
78
|
-
if text and len(text) > 20: # Skip very short pieces that might be UI elements
|
79
|
-
paragraphs.append(text)
|
80
|
-
|
81
|
-
bs_text = "\n\n".join(paragraphs)
|
82
|
-
if bs_text and len(bs_text) > 100:
|
83
|
-
if method == 'beautifulsoup':
|
84
|
-
print_success("Successfully extracted content with BeautifulSoup", "Content Extraction")
|
85
|
-
return clean_text(bs_text)
|
86
|
-
if not extracted_text or len(bs_text) > len(extracted_text):
|
87
|
-
extracted_text = bs_text
|
88
|
-
print_success("Successfully extracted content with BeautifulSoup", "Content Extraction")
|
89
|
-
except Exception as e:
|
90
|
-
print_warning(f"Content Extraction: BeautifulSoup extraction error: {str(e)}")
|
91
|
-
|
92
|
-
if not extracted_text:
|
93
|
-
print_warning("Content Extraction: Could not extract meaningful content with any method")
|
94
|
-
# Fall back to the raw text with HTML tags removed
|
95
|
-
extracted_text = BeautifulSoup(html_content, 'html.parser').get_text(separator='\n')
|
96
|
-
|
97
|
-
return clean_text(extracted_text)
|
98
|
-
|
99
|
-
|
100
|
-
def extract_targeted_content(html_content: str, target_strings: List[str],
|
101
|
-
context_size: int = 500) -> str:
|
102
|
-
"""
|
103
|
-
Extract content sections that contain specific target strings.
|
104
|
-
|
105
|
-
Args:
|
106
|
-
html_content: The HTML content to search within
|
107
|
-
target_strings: List of strings to search for in the content
|
108
|
-
context_size: Number of characters to include before and after each match
|
109
|
-
|
110
|
-
Returns:
|
111
|
-
Extracted content focusing on sections containing target strings
|
112
|
-
"""
|
113
|
-
if not target_strings:
|
114
|
-
return ""
|
115
|
-
|
116
|
-
print_info(f"Extracting content targeted around {len(target_strings)} search strings", "Targeted Extraction")
|
117
|
-
|
118
|
-
# First clean the HTML to make text extraction easier
|
119
|
-
soup = BeautifulSoup(html_content, 'html.parser')
|
120
|
-
|
121
|
-
# Remove script, style, and other non-content elements
|
122
|
-
for element in soup(['script', 'style', 'header', 'footer', 'nav']):
|
123
|
-
element.decompose()
|
124
|
-
|
125
|
-
# Get the full text content
|
126
|
-
full_text = soup.get_text(' ', strip=True)
|
127
|
-
full_text = re.sub(r'\s+', ' ', full_text) # Normalize whitespace
|
128
|
-
|
129
|
-
matched_sections = []
|
130
|
-
for target in target_strings:
|
131
|
-
if not target or len(target) < 3:
|
132
|
-
continue
|
133
|
-
|
134
|
-
# Try exact match first
|
135
|
-
if target in full_text:
|
136
|
-
indices = [m.start() for m in re.finditer(re.escape(target), full_text)]
|
137
|
-
for idx in indices:
|
138
|
-
start = max(0, idx - context_size)
|
139
|
-
end = min(len(full_text), idx + len(target) + context_size)
|
140
|
-
section = full_text[start:end]
|
141
|
-
# Add ellipsis if we're showing a fragment
|
142
|
-
if start > 0:
|
143
|
-
section = "..." + section
|
144
|
-
if end < len(full_text):
|
145
|
-
section = section + "..."
|
146
|
-
matched_sections.append(section)
|
147
|
-
else:
|
148
|
-
# Try fuzzy search if no exact match (looking for words in the target string)
|
149
|
-
words = [w for w in target.lower().split() if len(w) > 3]
|
150
|
-
if words:
|
151
|
-
for word in words:
|
152
|
-
pattern = r'\b' + re.escape(word) + r'\b'
|
153
|
-
matches = list(re.finditer(pattern, full_text.lower()))
|
154
|
-
for match in matches[:3]: # Limit to first 3 matches per word
|
155
|
-
idx = match.start()
|
156
|
-
start = max(0, idx - context_size)
|
157
|
-
end = min(len(full_text), idx + len(word) + context_size)
|
158
|
-
section = full_text[start:end]
|
159
|
-
if start > 0:
|
160
|
-
section = "..." + section
|
161
|
-
if end < len(full_text):
|
162
|
-
section = section + "..."
|
163
|
-
matched_sections.append(section)
|
164
|
-
|
165
|
-
# Deduplicate similar sections
|
166
|
-
unique_sections = []
|
167
|
-
for section in matched_sections:
|
168
|
-
if not any(similar_text(section, existing, threshold=0.7) for existing in unique_sections):
|
169
|
-
unique_sections.append(section)
|
170
|
-
|
171
|
-
if not unique_sections:
|
172
|
-
print_warning("Targeted Extraction: No content sections found matching the target strings")
|
173
|
-
return ""
|
174
|
-
|
175
|
-
# Join the sections with paragraph breaks
|
176
|
-
result = "\n\n".join(unique_sections)
|
177
|
-
print_success(f"Found {len(unique_sections)} relevant content sections", "Targeted Extraction")
|
178
|
-
|
179
|
-
return clean_text(result)
|
180
|
-
|
181
|
-
|
182
|
-
def extract_structured_content(html_content: str, url: str = None,
|
183
|
-
target_strings: List[str] = None) -> Dict[str, Union[str, List[str]]]:
|
184
|
-
"""
|
185
|
-
Extract structured content from a webpage, including title, main text, and key points.
|
186
|
-
|
187
|
-
Args:
|
188
|
-
html_content: The HTML content to extract from
|
189
|
-
url: Optional URL for methods that require it
|
190
|
-
target_strings: Optional list of strings to target specific content sections
|
191
|
-
|
192
|
-
Returns:
|
193
|
-
Dictionary with structured content elements
|
194
|
-
"""
|
195
|
-
soup = BeautifulSoup(html_content, 'html.parser')
|
196
|
-
|
197
|
-
# Extract title
|
198
|
-
title = ""
|
199
|
-
if soup.title:
|
200
|
-
title = soup.title.text.strip()
|
201
|
-
|
202
|
-
# Try to get more specific title from h1 if title looks generic
|
203
|
-
if not title or len(title) < 10:
|
204
|
-
h1_tags = soup.find_all('h1')
|
205
|
-
if h1_tags and len(h1_tags[0].text.strip()) > 10:
|
206
|
-
title = h1_tags[0].text.strip()
|
207
|
-
|
208
|
-
# Extract main content using trafilatura (our primary extractor)
|
209
|
-
main_text = extract_clean_text(html_content, method='trafilatura', url=url)
|
210
|
-
|
211
|
-
# If target strings are provided, prioritize content around those strings
|
212
|
-
targeted_text = ""
|
213
|
-
if target_strings:
|
214
|
-
targeted_text = extract_targeted_content(html_content, target_strings)
|
215
|
-
if targeted_text:
|
216
|
-
main_text = targeted_text
|
217
|
-
|
218
|
-
# Extract key points (using headers)
|
219
|
-
key_points = []
|
220
|
-
for header in soup.find_all(['h1', 'h2', 'h3']):
|
221
|
-
text = header.text.strip()
|
222
|
-
if text and len(text) > 5 and text not in key_points:
|
223
|
-
key_points.append(text)
|
224
|
-
|
225
|
-
# For news aggregators like Google News, look for news article titles specifically
|
226
|
-
if url and ('news.google.com' in url or 'news.yahoo.com' in url or 'msn.com/news' in url):
|
227
|
-
print_info("Detected news aggregator site, searching for article titles", "Content Extraction")
|
228
|
-
|
229
|
-
# Look for common news article title patterns
|
230
|
-
article_titles = []
|
231
|
-
|
232
|
-
# Google News specific article elements
|
233
|
-
articles = soup.find_all('article')
|
234
|
-
for article in articles[:20]: # Limit to first 20 articles
|
235
|
-
# Try to find the headline
|
236
|
-
headline = article.find(['h3', 'h4'])
|
237
|
-
if headline:
|
238
|
-
title = headline.text.strip()
|
239
|
-
if title and len(title) > 15 and title not in article_titles: # Skip short titles
|
240
|
-
article_titles.append(title)
|
241
|
-
|
242
|
-
# Add these to our key points
|
243
|
-
if article_titles:
|
244
|
-
key_points = article_titles + key_points
|
245
|
-
|
246
|
-
# Limit key points to most important ones
|
247
|
-
key_points = key_points[:15]
|
248
|
-
|
249
|
-
# Extract potential highlights (often in <strong>, <b>, <em> tags)
|
250
|
-
highlights = []
|
251
|
-
for tag in soup.find_all(['strong', 'b', 'em']):
|
252
|
-
text = tag.text.strip()
|
253
|
-
if text and len(text) > 15 and text not in highlights:
|
254
|
-
highlights.append(text)
|
255
|
-
|
256
|
-
# Limit highlights to most important ones
|
257
|
-
highlights = highlights[:5]
|
258
|
-
|
259
|
-
# Create a summary of the extracted content
|
260
|
-
summary = ""
|
261
|
-
if len(main_text) > 200:
|
262
|
-
# Extract first paragraph or two for summary
|
263
|
-
paragraphs = main_text.split('\n\n')
|
264
|
-
summary = '\n\n'.join(paragraphs[:2])
|
265
|
-
if len(summary) > 500:
|
266
|
-
summary = summary[:500] + "..."
|
267
|
-
|
268
|
-
return {
|
269
|
-
"title": title,
|
270
|
-
"main_text": main_text,
|
271
|
-
"key_points": key_points,
|
272
|
-
"highlights": highlights,
|
273
|
-
"summary": summary,
|
274
|
-
"word_count": len(main_text.split()),
|
275
|
-
"targeted_extraction": bool(target_strings and targeted_text)
|
276
|
-
}
|