ragdoll-cli 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +49 -14
- data/lib/ragdoll/cli/standalone_client.rb +7 -1
- data/lib/ragdoll/cli/version.rb +1 -1
- data/lib/ragdoll/cli.rb +49 -10
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 89c5200b031e287b125fb93d7025ea666ad3456bdc209fbc68a0dc70b673a410
|
4
|
+
data.tar.gz: 5bfa7dabb52f7b990b5935736ec1710a3b0f4cace386903bb9491bcf4f88720a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3eb42ababba85311e9d6bb3db1b5cf4f63f00e46a4a07ea8e9ec21841a9b4cdea44d36619b168a9298d08393d250917f66c14fa82c72b3866937833b3a44bae
|
7
|
+
data.tar.gz: 2a5085c3c38a9147f2bf26fb8a4ad2814bccdb05dfa3d5d76f5cb7ec21319d82d969758eece5b066f01b50100efa85ae9a9abfa20badca70d9144b5bd52c7906
|
data/README.md
CHANGED
@@ -42,9 +42,9 @@ This will install the `ragdoll` command-line tool.
|
|
42
42
|
export OPENAI_API_KEY=your_api_key_here
|
43
43
|
```
|
44
44
|
|
45
|
-
3. **
|
45
|
+
3. **Add documents:**
|
46
46
|
```bash
|
47
|
-
ragdoll
|
47
|
+
ragdoll add docs/*.pdf --recursive
|
48
48
|
```
|
49
49
|
|
50
50
|
4. **Search for content:**
|
@@ -77,21 +77,57 @@ ragdoll config path
|
|
77
77
|
ragdoll config database
|
78
78
|
```
|
79
79
|
|
80
|
-
### Document
|
80
|
+
### Document Management
|
81
81
|
|
82
82
|
```bash
|
83
|
-
#
|
84
|
-
ragdoll
|
83
|
+
# Add a single document
|
84
|
+
ragdoll add document.pdf
|
85
|
+
|
86
|
+
# Add multiple documents and directories
|
87
|
+
ragdoll add file1.pdf file2.txt ../docs
|
85
88
|
|
86
|
-
#
|
87
|
-
ragdoll
|
89
|
+
# Add files matching a pattern
|
90
|
+
ragdoll add "documents/*.pdf"
|
91
|
+
|
92
|
+
# Add recursively from directory (default: true)
|
93
|
+
ragdoll add "docs/" --recursive
|
88
94
|
|
89
95
|
# Filter by document type
|
90
|
-
ragdoll
|
96
|
+
ragdoll add "files/*" --type pdf
|
97
|
+
|
98
|
+
# Available types: pdf, docx, txt, md, html
|
99
|
+
|
100
|
+
# Skip confirmation prompts
|
101
|
+
ragdoll add docs/ --skip-confirmation
|
102
|
+
|
103
|
+
# Force addition of duplicate documents
|
104
|
+
ragdoll add document.pdf --force-duplicate
|
91
105
|
|
92
106
|
# Available types: pdf, docx, txt, md, html
|
93
107
|
```
|
94
108
|
|
109
|
+
#### Duplicate Detection
|
110
|
+
|
111
|
+
Ragdoll automatically detects and prevents duplicate documents from being processed:
|
112
|
+
|
113
|
+
```bash
|
114
|
+
# Normal behavior - duplicates are detected and skipped
|
115
|
+
ragdoll add document.pdf
|
116
|
+
ragdoll add document.pdf # Skipped (duplicate detected)
|
117
|
+
|
118
|
+
# Force addition of duplicates when needed
|
119
|
+
ragdoll add document.pdf --force-duplicate # Creates new document despite duplicate
|
120
|
+
|
121
|
+
# Batch processing safely handles mixed new/duplicate files
|
122
|
+
ragdoll add docs/*.pdf # Only processes new files, skips duplicates
|
123
|
+
```
|
124
|
+
|
125
|
+
**Duplicate Detection Features:**
|
126
|
+
- **File-based detection**: Compares file location, modification time, and SHA256 hash
|
127
|
+
- **Content-based detection**: Compares extracted text content and metadata
|
128
|
+
- **Smart similarity**: Detects duplicates even with minor differences (5% tolerance)
|
129
|
+
- **Performance optimized**: Uses database indexes for fast duplicate lookups
|
130
|
+
|
95
131
|
### Search
|
96
132
|
|
97
133
|
```bash
|
@@ -136,12 +172,9 @@ ragdoll search "backpropagation algorithm" --search-type fulltext
|
|
136
172
|
ragdoll search "transformer architecture" --search-type hybrid --semantic-weight 0.7 --text-weight 0.3
|
137
173
|
```
|
138
174
|
|
139
|
-
### Document
|
175
|
+
### Document Operations
|
140
176
|
|
141
177
|
```bash
|
142
|
-
# Add a single document
|
143
|
-
ragdoll add <path>
|
144
|
-
|
145
178
|
# List all documents
|
146
179
|
ragdoll list
|
147
180
|
|
@@ -155,6 +188,10 @@ ragdoll list --format plain
|
|
155
188
|
# Check document status
|
156
189
|
ragdoll status <id>
|
157
190
|
|
191
|
+
# Show detailed document information
|
192
|
+
ragdoll show <id>
|
193
|
+
ragdoll show <id> --format json
|
194
|
+
|
158
195
|
# Update document metadata
|
159
196
|
ragdoll update <id> --title "New Title"
|
160
197
|
|
@@ -164,8 +201,6 @@ ragdoll delete <id> --force # Bypass confirmation
|
|
164
201
|
|
165
202
|
# Show system statistics
|
166
203
|
ragdoll stats
|
167
|
-
ragdoll stats --format json
|
168
|
-
ragdoll stats --format plain
|
169
204
|
```
|
170
205
|
|
171
206
|
### Retrieval Utilities
|
data/lib/ragdoll/cli/standalone_client.rb
CHANGED
@@ -6,7 +6,13 @@ module Ragdoll
|
|
6
6
|
include DebugMe
|
7
7
|
|
8
8
|
def add_document(path, **options)
|
9
|
-
|
9
|
+
# Map force_duplicate option to force parameter for core library
|
10
|
+
core_options = options.dup
|
11
|
+
if core_options.key?(:force_duplicate)
|
12
|
+
core_options[:force] = core_options.delete(:force_duplicate)
|
13
|
+
end
|
14
|
+
|
15
|
+
Ragdoll.add_document(path: path, **core_options)
|
10
16
|
end
|
11
17
|
|
12
18
|
|
data/lib/ragdoll/cli/version.rb
CHANGED
data/lib/ragdoll/cli.rb
CHANGED
@@ -285,17 +285,22 @@ module Ragdoll
|
|
285
285
|
desc: 'Recursively process subdirectories (default: true)'
|
286
286
|
method_option :type, type: :string, aliases: '-t',
|
287
287
|
desc: 'Filter by document type (pdf, docx, txt, md, html)'
|
288
|
-
method_option :
|
288
|
+
method_option :skip_confirmation, type: :boolean, default: false, aliases: '-y',
|
289
289
|
desc: 'Skip confirmation prompts'
|
290
|
+
method_option :force_duplicate, type: :boolean, default: false, aliases: '-f',
|
291
|
+
desc: 'Force addition of duplicate documents (bypasses duplicate detection)'
|
290
292
|
def add(*paths)
|
291
293
|
if paths.empty?
|
292
294
|
puts 'Error: No paths provided'
|
293
|
-
puts 'Usage: ragdoll add PATH [PATH2] [PATH3]...'
|
295
|
+
puts 'Usage: ragdoll add PATH [PATH2] [PATH3]... [OPTIONS]'
|
294
296
|
puts 'Examples:'
|
295
297
|
puts ' ragdoll add file.pdf'
|
296
298
|
puts ' ragdoll add ../docs'
|
297
299
|
puts ' ragdoll add ../docs/**/*.md'
|
298
300
|
puts ' ragdoll add file1.txt file2.pdf ../docs'
|
301
|
+
puts ' ragdoll add file.pdf --force-duplicate # Force add even if duplicate'
|
302
|
+
puts ' ragdoll add ../docs --type=pdf # Only process PDF files'
|
303
|
+
puts ' ragdoll add ../docs --skip-confirmation # Skip prompts'
|
299
304
|
exit 1
|
300
305
|
end
|
301
306
|
|
@@ -338,12 +343,16 @@ module Ragdoll
|
|
338
343
|
|
339
344
|
progressbar.finish
|
340
345
|
|
341
|
-
# Summary
|
346
|
+
# Summary with duplicate detection information
|
342
347
|
success_count = all_results.count { |r| r && r[:status] == 'success' }
|
343
348
|
error_count = all_results.count { |r| r && r[:status] == 'error' }
|
349
|
+
duplicate_count = all_results.count { |r| r && r[:status] == 'success' && r[:duplicate] }
|
350
|
+
new_count = success_count - duplicate_count
|
344
351
|
|
345
352
|
puts "\nCompleted:"
|
346
|
-
puts " Successfully
|
353
|
+
puts " Successfully processed: #{success_count} files"
|
354
|
+
puts " New documents: #{new_count}"
|
355
|
+
puts " Duplicates #{options[:force_duplicate] ? 'forced' : 'detected'}: #{duplicate_count}" if duplicate_count > 0
|
347
356
|
puts " Errors: #{error_count} files"
|
348
357
|
|
349
358
|
if error_count > 0
|
@@ -355,10 +364,33 @@ module Ragdoll
|
|
355
364
|
|
356
365
|
return unless success_count > 0
|
357
366
|
|
358
|
-
|
359
|
-
all_results.select { |r| r && r[:status] == 'success'
|
360
|
-
|
361
|
-
puts "
|
367
|
+
# Show new documents
|
368
|
+
new_documents = all_results.select { |r| r && r[:status] == 'success' && !r[:duplicate] }
|
369
|
+
if new_documents.any?
|
370
|
+
puts "\nNew documents added:"
|
371
|
+
new_documents.each do |result|
|
372
|
+
puts " #{result[:file]} (ID: #{result[:document_id]})"
|
373
|
+
puts " #{result[:message]}" if result[:message]
|
374
|
+
end
|
375
|
+
end
|
376
|
+
|
377
|
+
# Show duplicate information
|
378
|
+
duplicate_documents = all_results.select { |r| r && r[:status] == 'success' && r[:duplicate] }
|
379
|
+
if duplicate_documents.any?
|
380
|
+
if options[:force_duplicate]
|
381
|
+
puts "\nDuplicates forced to be added:"
|
382
|
+
duplicate_documents.each do |result|
|
383
|
+
puts " #{result[:file]} (ID: #{result[:document_id]})"
|
384
|
+
puts " #{result[:message]}" if result[:message]
|
385
|
+
end
|
386
|
+
else
|
387
|
+
puts "\nDuplicates detected (skipped):"
|
388
|
+
duplicate_documents.each do |result|
|
389
|
+
puts " #{result[:file]} (existing ID: #{result[:document_id]})"
|
390
|
+
puts " #{result[:message]}" if result[:message]
|
391
|
+
end
|
392
|
+
puts "\nTip: Use --force-duplicate (-f) to force adding duplicates"
|
393
|
+
end
|
362
394
|
end
|
363
395
|
|
364
396
|
puts "\nNote: Documents are being processed in the background."
|
@@ -412,12 +444,19 @@ module Ragdoll
|
|
412
444
|
|
413
445
|
def process_single_file(client, path, options)
|
414
446
|
begin
|
415
|
-
|
447
|
+
# Pass force_duplicate parameter for duplicate detection
|
448
|
+
result = client.add_document(path, force_duplicate: options[:force_duplicate])
|
449
|
+
|
450
|
+
# Determine if this was a duplicate detection
|
451
|
+
duplicate_detected = result[:duplicate] || (result[:message] && result[:message].include?('already exists'))
|
452
|
+
|
416
453
|
{
|
417
454
|
file: path,
|
418
455
|
document_id: result[:document_id],
|
419
456
|
status: result[:success] ? 'success' : 'error',
|
420
|
-
message: result[:message]
|
457
|
+
message: result[:message],
|
458
|
+
duplicate: duplicate_detected,
|
459
|
+
forced: options[:force_duplicate]
|
421
460
|
}
|
422
461
|
rescue StandardError => e
|
423
462
|
{
|