ragdoll-cli 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -20
- data/lib/ragdoll/cli/commands/search.rb +5 -0
- data/lib/ragdoll/cli/configuration_loader.rb +10 -2
- data/lib/ragdoll/cli/standalone_client.rb +7 -1
- data/lib/ragdoll/cli/version.rb +1 -1
- data/lib/ragdoll/cli.rb +50 -11
- metadata +8 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 95d02e9e2a6f2ed5278406deae5fb36a8b55aaa1fa9228e3ca4bc8ad602daaf8
|
|
4
|
+
data.tar.gz: 34e38890fb8829a213057bb4f3142ce81c620e1e2463b70b8b3d2d0692beae5a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5767875a5559078f341466140788711c61b45f9cd52bc77fd0844b43574a84c9462367abbe29c88d9c997df86f6e6996f116fbe2c4bb5201321f13d8ccac6eb5
|
|
7
|
+
data.tar.gz: 68d51ae3e7061e95a9a2f74f55d269afdc3af9bb377edd6edb5758eae7a49938a4f3032cb3c358a4e82d367b1a28ff26882183b6f7714fa4cf04eb556a988c8f
|
data/README.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
> [!CAUTION]
|
|
2
|
+
> **Software Under Development by a Crazy Man**
|
|
3
|
+
>
|
|
4
|
+
> Evolved from multi-modal to unified text-based RAG architecture.
|
|
5
5
|
<br />
|
|
6
6
|
<div align="center">
|
|
7
7
|
<table>
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
</a>
|
|
13
13
|
</td>
|
|
14
14
|
<td width="50%" valign="top">
|
|
15
|
-
<p>
|
|
15
|
+
<p>Unified Text-Based RAG converts all media types—images, audio, documents—into comprehensive text representations before vectorization. This approach enables powerful cross-modal search where you can find images through AI-generated descriptions, audio through transcripts, and all content through a single, unified text-based search index. The system combines intelligent text conversion with retrieval-based methods and generative large language models for enhanced AI response generation.</p>
|
|
16
16
|
</td>
|
|
17
17
|
</tr>
|
|
18
18
|
</table>
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
|
|
21
21
|
# Ragdoll::CLI
|
|
22
22
|
|
|
23
|
-
Standalone command-line interface for the Ragdoll
|
|
23
|
+
Standalone command-line interface for the Ragdoll unified text-based RAG system. Converts all media types to searchable text and provides powerful cross-modal search capabilities through a simple CLI.
|
|
24
24
|
|
|
25
25
|
## Installation
|
|
26
26
|
|
|
@@ -42,9 +42,9 @@ This will install the `ragdoll` command-line tool.
|
|
|
42
42
|
export OPENAI_API_KEY=your_api_key_here
|
|
43
43
|
```
|
|
44
44
|
|
|
45
|
-
3. **
|
|
45
|
+
3. **Add documents:**
|
|
46
46
|
```bash
|
|
47
|
-
ragdoll
|
|
47
|
+
ragdoll add docs/*.pdf --recursive
|
|
48
48
|
```
|
|
49
49
|
|
|
50
50
|
4. **Search for content:**
|
|
@@ -77,21 +77,57 @@ ragdoll config path
|
|
|
77
77
|
ragdoll config database
|
|
78
78
|
```
|
|
79
79
|
|
|
80
|
-
### Document
|
|
80
|
+
### Document Management
|
|
81
81
|
|
|
82
82
|
```bash
|
|
83
|
-
#
|
|
84
|
-
ragdoll
|
|
83
|
+
# Add a single document
|
|
84
|
+
ragdoll add document.pdf
|
|
85
85
|
|
|
86
|
-
#
|
|
87
|
-
ragdoll
|
|
86
|
+
# Add multiple documents and directories
|
|
87
|
+
ragdoll add file1.pdf file2.txt ../docs
|
|
88
|
+
|
|
89
|
+
# Add files matching a pattern
|
|
90
|
+
ragdoll add "documents/*.pdf"
|
|
91
|
+
|
|
92
|
+
# Add recursively from directory (default: true)
|
|
93
|
+
ragdoll add "docs/" --recursive
|
|
88
94
|
|
|
89
95
|
# Filter by document type
|
|
90
|
-
ragdoll
|
|
96
|
+
ragdoll add "files/*" --type pdf
|
|
97
|
+
|
|
98
|
+
# Available types: pdf, docx, txt, md, html
|
|
99
|
+
|
|
100
|
+
# Skip confirmation prompts
|
|
101
|
+
ragdoll add docs/ --skip-confirmation
|
|
102
|
+
|
|
103
|
+
# Force addition of duplicate documents
|
|
104
|
+
ragdoll add document.pdf --force-duplicate
|
|
91
105
|
|
|
92
106
|
# Available types: pdf, docx, txt, md, html
|
|
93
107
|
```
|
|
94
108
|
|
|
109
|
+
#### Duplicate Detection
|
|
110
|
+
|
|
111
|
+
Ragdoll automatically detects and prevents duplicate documents from being processed:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Normal behavior - duplicates are detected and skipped
|
|
115
|
+
ragdoll add document.pdf
|
|
116
|
+
ragdoll add document.pdf # Skipped (duplicate detected)
|
|
117
|
+
|
|
118
|
+
# Force addition of duplicates when needed
|
|
119
|
+
ragdoll add document.pdf --force-duplicate # Creates new document despite duplicate
|
|
120
|
+
|
|
121
|
+
# Batch processing safely handles mixed new/duplicate files
|
|
122
|
+
ragdoll add docs/*.pdf # Only processes new files, skips duplicates
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**Duplicate Detection Features:**
|
|
126
|
+
- **File-based detection**: Compares file location, modification time, and SHA256 hash
|
|
127
|
+
- **Content-based detection**: Compares extracted text content and metadata
|
|
128
|
+
- **Smart similarity**: Detects duplicates even with minor differences (5% tolerance)
|
|
129
|
+
- **Performance optimized**: Uses database indexes for fast duplicate lookups
|
|
130
|
+
|
|
95
131
|
### Search
|
|
96
132
|
|
|
97
133
|
```bash
|
|
@@ -136,12 +172,9 @@ ragdoll search "backpropagation algorithm" --search-type fulltext
|
|
|
136
172
|
ragdoll search "transformer architecture" --search-type hybrid --semantic-weight 0.7 --text-weight 0.3
|
|
137
173
|
```
|
|
138
174
|
|
|
139
|
-
### Document
|
|
175
|
+
### Document Operations
|
|
140
176
|
|
|
141
177
|
```bash
|
|
142
|
-
# Add a single document
|
|
143
|
-
ragdoll add <path>
|
|
144
|
-
|
|
145
178
|
# List all documents
|
|
146
179
|
ragdoll list
|
|
147
180
|
|
|
@@ -155,6 +188,10 @@ ragdoll list --format plain
|
|
|
155
188
|
# Check document status
|
|
156
189
|
ragdoll status <id>
|
|
157
190
|
|
|
191
|
+
# Show detailed document information
|
|
192
|
+
ragdoll show <id>
|
|
193
|
+
ragdoll show <id> --format json
|
|
194
|
+
|
|
158
195
|
# Update document metadata
|
|
159
196
|
ragdoll update <id> --title "New Title"
|
|
160
197
|
|
|
@@ -164,8 +201,6 @@ ragdoll delete <id> --force # Bypass confirmation
|
|
|
164
201
|
|
|
165
202
|
# Show system statistics
|
|
166
203
|
ragdoll stats
|
|
167
|
-
ragdoll stats --format json
|
|
168
|
-
ragdoll stats --format plain
|
|
169
204
|
```
|
|
170
205
|
|
|
171
206
|
### Retrieval Utilities
|
|
@@ -10,6 +10,11 @@ module Ragdoll
|
|
|
10
10
|
|
|
11
11
|
puts "Searching for: #{query}"
|
|
12
12
|
puts "Search type: #{options[:search_type] || 'semantic'}"
|
|
13
|
+
|
|
14
|
+
# Show deprecation warning for content_type
|
|
15
|
+
if options[:content_type]
|
|
16
|
+
puts "⚠️ DEPRECATED: --content_type option is deprecated. Unified text-based system converts all media to searchable text."
|
|
17
|
+
end
|
|
13
18
|
|
|
14
19
|
# Show hybrid search weights if applicable
|
|
15
20
|
if options[:search_type] == 'hybrid'
|
|
@@ -22,7 +22,9 @@ module Ragdoll
|
|
|
22
22
|
|
|
23
23
|
default_config = {
|
|
24
24
|
'llm_provider' => 'openai',
|
|
25
|
-
'embedding_model' => 'text-embedding-3-
|
|
25
|
+
'embedding_model' => 'text-embedding-3-large',
|
|
26
|
+
'embedding_provider' => 'openai',
|
|
27
|
+
'use_unified_content' => true,
|
|
26
28
|
'chunk_size' => 1000,
|
|
27
29
|
'chunk_overlap' => 200,
|
|
28
30
|
'search_similarity_threshold' => 0.7,
|
|
@@ -72,7 +74,13 @@ module Ragdoll
|
|
|
72
74
|
ragdoll_config.llm_provider = config['llm_provider']&.to_sym || :openai
|
|
73
75
|
end
|
|
74
76
|
if ragdoll_config.respond_to?(:embedding_model=)
|
|
75
|
-
ragdoll_config.embedding_model = config['embedding_model'] || 'text-embedding-3-
|
|
77
|
+
ragdoll_config.embedding_model = config['embedding_model'] || 'text-embedding-3-large'
|
|
78
|
+
end
|
|
79
|
+
if ragdoll_config.respond_to?(:embedding_provider=)
|
|
80
|
+
ragdoll_config.embedding_provider = config['embedding_provider']&.to_sym || :openai
|
|
81
|
+
end
|
|
82
|
+
if ragdoll_config.respond_to?(:use_unified_content=)
|
|
83
|
+
ragdoll_config.use_unified_content = config['use_unified_content'] != false
|
|
76
84
|
end
|
|
77
85
|
|
|
78
86
|
# Processing settings
|
|
@@ -6,7 +6,13 @@ module Ragdoll
|
|
|
6
6
|
include DebugMe
|
|
7
7
|
|
|
8
8
|
def add_document(path, **options)
|
|
9
|
-
|
|
9
|
+
# Map force_duplicate option to force parameter for core library
|
|
10
|
+
core_options = options.dup
|
|
11
|
+
if core_options.key?(:force_duplicate)
|
|
12
|
+
core_options[:force] = core_options.delete(:force_duplicate)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
Ragdoll.add_document(path: path, **core_options)
|
|
10
16
|
end
|
|
11
17
|
|
|
12
18
|
|
data/lib/ragdoll/cli/version.rb
CHANGED
data/lib/ragdoll/cli.rb
CHANGED
|
@@ -42,7 +42,7 @@ module Ragdoll
|
|
|
42
42
|
method_option :threshold, type: :numeric,
|
|
43
43
|
desc: 'Similarity threshold (0.0-1.0, lower = more results)'
|
|
44
44
|
method_option :content_type, type: :string, aliases: '-c',
|
|
45
|
-
desc: '
|
|
45
|
+
desc: 'DEPRECATED: Content type filtering (unified text-based system converts all media to text)'
|
|
46
46
|
method_option :classification, type: :string, aliases: '-C',
|
|
47
47
|
desc: 'Filter by classification'
|
|
48
48
|
method_option :keywords, type: :string, aliases: '-k',
|
|
@@ -285,17 +285,22 @@ module Ragdoll
|
|
|
285
285
|
desc: 'Recursively process subdirectories (default: true)'
|
|
286
286
|
method_option :type, type: :string, aliases: '-t',
|
|
287
287
|
desc: 'Filter by document type (pdf, docx, txt, md, html)'
|
|
288
|
-
method_option :
|
|
288
|
+
method_option :skip_confirmation, type: :boolean, default: false, aliases: '-y',
|
|
289
289
|
desc: 'Skip confirmation prompts'
|
|
290
|
+
method_option :force_duplicate, type: :boolean, default: false, aliases: '-f',
|
|
291
|
+
desc: 'Force addition of duplicate documents (bypasses duplicate detection)'
|
|
290
292
|
def add(*paths)
|
|
291
293
|
if paths.empty?
|
|
292
294
|
puts 'Error: No paths provided'
|
|
293
|
-
puts 'Usage: ragdoll add PATH [PATH2] [PATH3]...'
|
|
295
|
+
puts 'Usage: ragdoll add PATH [PATH2] [PATH3]... [OPTIONS]'
|
|
294
296
|
puts 'Examples:'
|
|
295
297
|
puts ' ragdoll add file.pdf'
|
|
296
298
|
puts ' ragdoll add ../docs'
|
|
297
299
|
puts ' ragdoll add ../docs/**/*.md'
|
|
298
300
|
puts ' ragdoll add file1.txt file2.pdf ../docs'
|
|
301
|
+
puts ' ragdoll add file.pdf --force-duplicate # Force add even if duplicate'
|
|
302
|
+
puts ' ragdoll add ../docs --type=pdf # Only process PDF files'
|
|
303
|
+
puts ' ragdoll add ../docs --skip-confirmation # Skip prompts'
|
|
299
304
|
exit 1
|
|
300
305
|
end
|
|
301
306
|
|
|
@@ -338,12 +343,16 @@ module Ragdoll
|
|
|
338
343
|
|
|
339
344
|
progressbar.finish
|
|
340
345
|
|
|
341
|
-
# Summary
|
|
346
|
+
# Summary with duplicate detection information
|
|
342
347
|
success_count = all_results.count { |r| r && r[:status] == 'success' }
|
|
343
348
|
error_count = all_results.count { |r| r && r[:status] == 'error' }
|
|
349
|
+
duplicate_count = all_results.count { |r| r && r[:status] == 'success' && r[:duplicate] }
|
|
350
|
+
new_count = success_count - duplicate_count
|
|
344
351
|
|
|
345
352
|
puts "\nCompleted:"
|
|
346
|
-
puts " Successfully
|
|
353
|
+
puts " Successfully processed: #{success_count} files"
|
|
354
|
+
puts " New documents: #{new_count}"
|
|
355
|
+
puts " Duplicates #{options[:force_duplicate] ? 'forced' : 'detected'}: #{duplicate_count}" if duplicate_count > 0
|
|
347
356
|
puts " Errors: #{error_count} files"
|
|
348
357
|
|
|
349
358
|
if error_count > 0
|
|
@@ -355,10 +364,33 @@ module Ragdoll
|
|
|
355
364
|
|
|
356
365
|
return unless success_count > 0
|
|
357
366
|
|
|
358
|
-
|
|
359
|
-
all_results.select { |r| r && r[:status] == 'success'
|
|
360
|
-
|
|
361
|
-
puts "
|
|
367
|
+
# Show new documents
|
|
368
|
+
new_documents = all_results.select { |r| r && r[:status] == 'success' && !r[:duplicate] }
|
|
369
|
+
if new_documents.any?
|
|
370
|
+
puts "\nNew documents added:"
|
|
371
|
+
new_documents.each do |result|
|
|
372
|
+
puts " #{result[:file]} (ID: #{result[:document_id]})"
|
|
373
|
+
puts " #{result[:message]}" if result[:message]
|
|
374
|
+
end
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
# Show duplicate information
|
|
378
|
+
duplicate_documents = all_results.select { |r| r && r[:status] == 'success' && r[:duplicate] }
|
|
379
|
+
if duplicate_documents.any?
|
|
380
|
+
if options[:force_duplicate]
|
|
381
|
+
puts "\nDuplicates forced to be added:"
|
|
382
|
+
duplicate_documents.each do |result|
|
|
383
|
+
puts " #{result[:file]} (ID: #{result[:document_id]})"
|
|
384
|
+
puts " #{result[:message]}" if result[:message]
|
|
385
|
+
end
|
|
386
|
+
else
|
|
387
|
+
puts "\nDuplicates detected (skipped):"
|
|
388
|
+
duplicate_documents.each do |result|
|
|
389
|
+
puts " #{result[:file]} (existing ID: #{result[:document_id]})"
|
|
390
|
+
puts " #{result[:message]}" if result[:message]
|
|
391
|
+
end
|
|
392
|
+
puts "\nTip: Use --force-duplicate (-f) to force adding duplicates"
|
|
393
|
+
end
|
|
362
394
|
end
|
|
363
395
|
|
|
364
396
|
puts "\nNote: Documents are being processed in the background."
|
|
@@ -412,12 +444,19 @@ module Ragdoll
|
|
|
412
444
|
|
|
413
445
|
def process_single_file(client, path, options)
|
|
414
446
|
begin
|
|
415
|
-
|
|
447
|
+
# Pass force_duplicate parameter for duplicate detection
|
|
448
|
+
result = client.add_document(path, force_duplicate: options[:force_duplicate])
|
|
449
|
+
|
|
450
|
+
# Determine if this was a duplicate detection
|
|
451
|
+
duplicate_detected = result[:duplicate] || (result[:message] && result[:message].include?('already exists'))
|
|
452
|
+
|
|
416
453
|
{
|
|
417
454
|
file: path,
|
|
418
455
|
document_id: result[:document_id],
|
|
419
456
|
status: result[:success] ? 'success' : 'error',
|
|
420
|
-
message: result[:message]
|
|
457
|
+
message: result[:message],
|
|
458
|
+
duplicate: duplicate_detected,
|
|
459
|
+
forced: options[:force_duplicate]
|
|
421
460
|
}
|
|
422
461
|
rescue StandardError => e
|
|
423
462
|
{
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ragdoll-cli
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.10
|
|
4
|
+
version: 0.1.12
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dewayne VanHoozer
|
|
@@ -15,14 +15,14 @@ dependencies:
|
|
|
15
15
|
requirements:
|
|
16
16
|
- - ">="
|
|
17
17
|
- !ruby/object:Gem::Version
|
|
18
|
-
version: 0.1.
|
|
18
|
+
version: 0.1.12
|
|
19
19
|
type: :runtime
|
|
20
20
|
prerelease: false
|
|
21
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
22
22
|
requirements:
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
|
-
version: 0.1.
|
|
25
|
+
version: 0.1.12
|
|
26
26
|
- !ruby/object:Gem::Dependency
|
|
27
27
|
name: ruby-progressbar
|
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -177,8 +177,9 @@ dependencies:
|
|
|
177
177
|
- - ">="
|
|
178
178
|
- !ruby/object:Gem::Version
|
|
179
179
|
version: '0'
|
|
180
|
-
description: Command-line interface for Ragdoll RAG system
|
|
181
|
-
|
|
180
|
+
description: Command-line interface for Ragdoll's unified text-based RAG system. Converts
|
|
181
|
+
all media types (images, audio, documents) to searchable text for powerful cross-modal
|
|
182
|
+
search capabilities. Under development. Contributors welcome.
|
|
182
183
|
email:
|
|
183
184
|
- dvanhoozer@gmail.com
|
|
184
185
|
executables:
|
|
@@ -227,7 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
227
228
|
- !ruby/object:Gem::Version
|
|
228
229
|
version: '0'
|
|
229
230
|
requirements: []
|
|
230
|
-
rubygems_version: 3.7.
|
|
231
|
+
rubygems_version: 3.7.2
|
|
231
232
|
specification_version: 4
|
|
232
|
-
summary:
|
|
233
|
+
summary: Unified Text-Based RAG CLI for Cross-Modal Search
|
|
233
234
|
test_files: []
|