ragdoll 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +75 -0
- data/Rakefile +10 -0
- data/app/models/ragdoll/document.rb +9 -0
- data/app/models/ragdoll/embedding.rb +9 -0
- data/config/initializers/ragdoll.rb +6 -0
- data/config/routes.rb +5 -0
- data/db/migrate/20250218123456_create_documents.rb +20 -0
- data/lib/config/database.yml +28 -0
- data/lib/config/ragdoll.yml +31 -0
- data/lib/ragdoll/engine.rb +16 -0
- data/lib/ragdoll/import_job.rb +15 -0
- data/lib/ragdoll/ingestion.rb +30 -0
- data/lib/ragdoll/search.rb +18 -0
- data/lib/ragdoll/version.rb +7 -0
- data/lib/ragdoll.rb +12 -0
- data/lib/tasks/import_task.thor +32 -0
- data/lib/tasks/jobs_task.thor +40 -0
- data/lib/tasks/ragdoll_tasks.thor +7 -0
- data/lib/tasks/search_task.thor +55 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bea4621e2b802db79d78f8b1d0679cf2f81ed35b91d35683ce0afcb83ddc54e1
|
4
|
+
data.tar.gz: ec12fb975b154f77a42d54fb3c716d523e1b90e4e0122b3576c5aac15a957340
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3702308d3b772dfc0ebf429a26bae0f0378456e9d6c48357b8e2a4cdeb3744e78b43fd610ef308ff6348f1bec28bb37d51bd5e335d78583574b3212c8f544a33
|
7
|
+
data.tar.gz: 9beebfebafe1ed2e815a3042949b68e5f208d8c555a92228eb4098fa99900f07d2985654766e2907de3069f5d10b6cf3e65fb7a3b431ee9db836e6111f1e27f2
|
data/README.md
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# Ragdoll
|
2
|
+
|
3
|
+
Ragdoll is a Rails Engine designed for document ingestion and search. It allows you to import documents, vectorize them, and perform searches using vector representations.
|
4
|
+
|
5
|
+
## Installation as a Rails Engine
|
6
|
+
|
7
|
+
To use Ragdoll as a Rails Engine, add it to your application's Gemfile by running:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
bundle add ragdoll
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
```bash
|
16
|
+
bundle install
|
17
|
+
```
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
```bash
|
22
|
+
gem install ragdoll
|
23
|
+
```
|
24
|
+
|
25
|
+
## Usage as a Rails Engine
|
26
|
+
|
27
|
+
### Importing Documents
|
28
|
+
|
29
|
+
To import documents from a file, glob, or directory, use the following command:
|
30
|
+
|
31
|
+
```bash
|
32
|
+
ragdoll import PATH
|
33
|
+
```
|
34
|
+
|
35
|
+
- `PATH`: The path to the file, glob pattern, or directory to import.
|
36
|
+
- Use the `-r` or `--recursive` option to import files recursively from directories.
|
37
|
+
- Use the `-j` or `--jobs` option to specify the number of concurrent import jobs.
|
38
|
+
|
39
|
+
### Managing Jobs
|
40
|
+
|
41
|
+
To manage import jobs, use the following command:
|
42
|
+
|
43
|
+
```bash
|
44
|
+
ragdoll jobs [JOB_ID]
|
45
|
+
```
|
46
|
+
|
47
|
+
- `JOB_ID`: The ID of a specific job to manage.
|
48
|
+
- Use `--stop`, `--pause`, or `--resume` to control a specific job.
|
49
|
+
- Use `--stop-all`, `--pause-all`, or `--resume-all` to control all jobs.
|
50
|
+
|
51
|
+
### Searching Documents
|
52
|
+
|
53
|
+
To search the database with a prompt, use the following command:
|
54
|
+
|
55
|
+
```bash
|
56
|
+
ragdoll search PROMPT
|
57
|
+
```
|
58
|
+
|
59
|
+
- `PROMPT`: The search prompt, given as a string. Alternatively, use the `-p` option to specify a file containing the prompt text.
|
60
|
+
- Use the `--max-count` option to specify the maximum number of results to return.
|
61
|
+
- Use the `--rerank` option to rerank results using keyword search.
|
62
|
+
|
63
|
+
## Development
|
64
|
+
|
65
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
|
+
|
67
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
|
+
|
69
|
+
## Contributing
|
70
|
+
|
71
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/MadBomber/ragdoll.
|
72
|
+
|
73
|
+
## License
|
74
|
+
|
75
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/config/routes.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# This migration creates the documents table with necessary extensions for PostgreSQL.
|
2
|
+
|
3
|
+
module Ragdoll
  # Enables the PostgreSQL extensions +pg_trgm+ and +fuzzystrmatch+, then
  # creates the +documents+ table.
  #
  # NOTE(review): Active Record uses a column named +type+ as the
  # single-table-inheritance discriminator by default — confirm the Document
  # model either relies on STI or overrides +inheritance_column+.
  # NOTE(review): this migration class lives inside the Ragdoll namespace —
  # confirm the host application's migrator can resolve it under that name.
  class CreateDocuments < ActiveRecord::Migration[7.0]
    def change
      %w[pg_trgm fuzzystrmatch].each { |extension| enable_extension extension }

      create_table :documents do |table|
        # Column helpers accept several names; the columns are created in the
        # order listed.
        table.string :location, :summary, :type
        table.datetime :processing_started_at, :processing_finished_at

        table.timestamps
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This file contains the database configuration for the Ragdoll gem, using environment variables.
|
2
|
+
|
3
|
+
# Connection settings shared by every environment. Pool size and timeout can
# be overridden with RAGDOLL_POOL and RAGDOLL_TIMEOUT.
default: &default
  adapter: postgresql
  encoding: unicode
  pool: <%= ENV.fetch("RAGDOLL_POOL", 5) %>
  timeout: <%= ENV.fetch("RAGDOLL_TIMEOUT", 5000) %>

development:
  <<: *default
  host: <%= ENV.fetch("RAGDOLL_HOST", "localhost") %>
  database: <%= ENV.fetch("RAGDOLL_DATABASE", "ragdoll_development") %>
  username: <%= ENV.fetch("RAGDOLL_USER", "user") %>
  password: <%= ENV.fetch("RAGDOLL_PASSWORD", "password") %>

# NOTE(review): development and test read the same RAGDOLL_DATABASE variable,
# so exporting it points both environments at one database — confirm intended.
test:
  <<: *default
  host: <%= ENV.fetch("RAGDOLL_HOST", "localhost") %>
  database: <%= ENV.fetch("RAGDOLL_DATABASE", "ragdoll_test") %>
  username: <%= ENV.fetch("RAGDOLL_USER", "user") %>
  password: <%= ENV.fetch("RAGDOLL_PASSWORD", "password") %>

# Production has no fallback values. ENV[] is used instead of a default-less
# ENV.fetch because ERB evaluates every tag in the file whichever environment
# is selected; ENV.fetch would raise KeyError in development and test whenever
# the production variables are not exported.
production:
  <<: *default
  host: <%= ENV["RAGDOLL_HOST"] %>
  database: <%= ENV["RAGDOLL_DATABASE"] %>
  username: <%= ENV["RAGDOLL_USER"] %>
  password: <%= ENV["RAGDOLL_PASSWORD"] %>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# This file contains the default configuration settings for the Ragdoll gem, including database configurations.
|
2
|
+
|
3
|
+
# Defaults shared by every environment.
default: &default
  # Anchored separately so an environment can override single database keys:
  # the `<<` merge is shallow, so redefining `database:` without re-merging
  # this anchor would discard every key that is not repeated.
  database: &default_database
    host: localhost
    database: ragdoll_development
    user: user
    password: password
    pool: 5
    timeout: 5000

  llm:
    embeddings_model: "llama-2-7b"
    reranking_model: "llama-2-13b"
    chat_model: "llama-2-70b"

development:
  <<: *default

test:
  <<: *default
  database:
    <<: *default_database
    database: ragdoll_test

# ENV[] is used instead of a default-less ENV.fetch because ERB evaluates every
# tag in the file whichever environment is selected; ENV.fetch would raise
# KeyError in development and test when these variables are not exported.
production:
  <<: *default
  database:
    <<: *default_database
    host: <%= ENV["RAGDOLL_HOST"] %>
    database: <%= ENV["RAGDOLL_DATABASE"] %>
    user: <%= ENV["RAGDOLL_USER"] %>
    password: <%= ENV["RAGDOLL_PASSWORD"] %>
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# This file defines the Ragdoll engine, which integrates the gem with Rails applications.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require "rails/engine"
|
6
|
+
|
7
|
+
module Ragdoll
  # Rails engine for Ragdoll. The engine's namespace is isolated under
  # +Ragdoll+, and its generators are configured to use Minitest together with
  # factory_bot factories placed in test/factories.
  class Engine < ::Rails::Engine
    isolate_namespace Ragdoll

    config.generators do |generators|
      generators.test_framework :minitest
      generators.fixture_replacement :factory_bot
      generators.factory_bot dir: 'test/factories'
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# This file defines the ImportJob class for handling document import tasks in the background.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module Ragdoll
  # Background job that imports one file: it reads the file, runs the text
  # through Ragdoll::Ingestion and then asks the ingestion to store the result.
  #
  # NOTE(review): SolidJob::Base is not defined anywhere in the visible
  # sources — confirm which job framework provides it.
  class ImportJob < SolidJob::Base
    # @param file [String] path of the file to import
    # @raise [SystemCallError] when the file cannot be read (from File.read)
    def perform(file)
      ingestion = Ragdoll::Ingestion.new(File.read(file))

      # The vectors returned here are not kept: store_in_database takes no
      # arguments. NOTE(review): as written the chunks never reach storage.
      ingestion.chunk_and_vectorize
      ingestion.store_in_database

      puts "Imported #{file} successfully."
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# This file contains the Ingestion class responsible for processing documents by chunking and vectorizing them.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module Ragdoll
  # Splits a document into paragraph chunks and converts each chunk into a
  # placeholder "vector": the list of its downcased words.
  class Ingestion
    # Two or more consecutive line breaks (LF, CRLF, ...) separate paragraphs.
    PARAGRAPH_BREAK = /\R{2,}/

    # @param document [String] full text of the document to ingest
    def initialize(document)
      @document = document
    end

    # Chunks the document by paragraph and vectorizes every chunk.
    #
    # Blank chunks (caused by leading or repeated blank lines) are dropped so
    # they do not produce empty vectors.
    #
    # @return [Array<Array<String>>] one word list per non-blank paragraph
    def chunk_and_vectorize
      @document
        .split(PARAGRAPH_BREAK)
        .reject { |chunk| chunk.strip.empty? }
        .map { |chunk| vectorize(chunk) }
    end

    # Placeholder: storing the vectorized data is not implemented yet.
    #
    # @return [nil]
    def store_in_database
      # Implement logic to store vectorized data in the database
    end

    private

    # Placeholder vectorization: whitespace-split words, downcased.
    #
    # @param chunk [String]
    # @return [Array<String>]
    def vectorize(chunk)
      chunk.split.map(&:downcase)
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# This file contains the Search class responsible for querying the database with a prompt.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module Ragdoll
  # Queries stored entries with a prompt. The data source is still a
  # placeholder (an empty list), so every search currently returns [].
  class Search
    # @param prompt [Object] value each entry is tested against with include?
    def initialize(prompt)
      @prompt = prompt
    end

    # Returns the entries that include the prompt, capped at +max_count+.
    #
    # @param max_count [Integer, nil] maximum number of results; nil means
    #   no limit
    # @return [Array] matching entries, at most +max_count+ of them
    def search_database(max_count)
      results = [] # Placeholder for actual database query results
      matches = results.select { |entry| entry.include?(@prompt) }

      max_count ? matches.first(max_count) : matches
    end
  end
end
|
data/lib/ragdoll.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true

# This file is the main entry point for the Ragdoll gem, requiring all necessary components.

require "ragdoll/version"
require "ragdoll/engine"

module Ragdoll
  # Ragdoll-namespaced error class deriving from StandardError.
  class Error < StandardError; end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require_relative '../ragdoll/import_job'
|
5
|
+
|
6
|
+
module Ragdoll
  # Thor command that queues an import job for every file found under PATH.
  #
  # NOTE(review): SolidQueue.new(concurrency:) and ImportJob.perform_async are
  # not defined in the visible sources — confirm both exist in the job backend.
  class ImportTask < Thor
    desc "import PATH", "Import documents from a file, glob, or directory"
    method_option :recursive, aliases: "-r", type: :boolean, default: false, desc: "Recursively import files from directories"
    method_option :jobs, aliases: ["-j", "--jobs"], type: :numeric, default: 1, desc: "Number of concurrent import jobs"
    # @param path [String] a file, a glob pattern, or a directory
    def import(path)
      queue = SolidQueue.new(concurrency: options[:jobs])

      candidate_paths(path).each do |file|
        # Directories and other non-regular entries matched by a glob are skipped.
        next unless File.file?(file)

        # The block parameter has its own name so it no longer shadows +file+.
        queue.push(file) do |queued_file|
          Ragdoll::ImportJob.perform_async(queued_file)
        end
      end
    end

    private

    # Expands PATH into the list of paths to consider for import.
    #
    # A directory yields its entries (recursively with --recursive), an
    # existing file yields itself, and anything else is treated as a glob
    # pattern. The file check comes first so a file whose name contains glob
    # metacharacters is still imported literally.
    #
    # @param path [String]
    # @return [Array<String>]
    def candidate_paths(path)
      if File.directory?(path)
        Dir.glob(options[:recursive] ? "#{path}/**/*" : "#{path}/*")
      elsif File.file?(path)
        [path]
      else
        Dir.glob(path)
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
|
5
|
+
module Ragdoll
  # Thor command that reports on, or announces a control action for, import
  # jobs. Every branch only prints a message; the status listing is hard-coded
  # sample output.
  class JobsTask < Thor
    desc "jobs [JOB_ID]", "Report the status of all running and queued import jobs, or a specific job if JOB_ID is provided"
    method_option :stop_all, type: :boolean, default: false, desc: "Stop all running and queued jobs"
    method_option :pause_all, type: :boolean, default: false, desc: "Pause all running jobs"
    method_option :resume_all, type: :boolean, default: false, desc: "Resume all paused jobs"
    method_option :stop, type: :boolean, default: false, desc: "Stop a specific job"
    method_option :pause, type: :boolean, default: false, desc: "Pause a specific job"
    method_option :resume, type: :boolean, default: false, desc: "Resume a specific job"
    # @param job_id [String, nil] job to act on; nil addresses all jobs
    def jobs(job_id = nil)
      puts(job_id ? single_job_report(job_id) : all_jobs_report)
    end

    private

    # Message for a single job; the first matching option wins
    # (stop, then pause, then resume), otherwise a status line.
    def single_job_report(job_id)
      return "Stopping job ID: #{job_id}..." if options[:stop]
      return "Pausing job ID: #{job_id}..." if options[:pause]
      return "Resuming job ID: #{job_id}..." if options[:resume]

      "Fetching status for job ID: #{job_id}..."
    end

    # Message(s) for all jobs; the first matching option wins
    # (stop_all, then pause_all, then resume_all), otherwise the status
    # listing, printed one line per array element.
    def all_jobs_report
      return "Stopping all jobs..." if options[:stop_all]
      return "Pausing all running jobs..." if options[:pause_all]
      return "Resuming all paused jobs..." if options[:resume_all]

      [
        "Fetching status of all running and queued import jobs...",
        "Job ID: 12345, Status: Running, File: document1.txt",
        "Job ID: 12346, Status: Running, File: document2.txt"
      ]
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require_relative '../ragdoll/search'
|
5
|
+
|
6
|
+
module Ragdoll
  # Thor command that searches the database with a prompt and prints the
  # source and metadata of every result.
  class SearchTask < Thor
    desc "search PROMPT", "Search the database with a prompt"
    method_option :prompt, aliases: ["-p", "--prompt"], type: :string, desc: "File path containing the prompt text"
    method_option :max_count, type: :numeric, default: 10, desc: "Maximum number of results to return"
    method_option :rerank, type: :boolean, default: false, desc: "Rerank results using keyword search"
    # @param prompt [String, nil] prompt text; replaced by the contents of the
    #   file given with -p when that option is present
    # @raise [SystemCallError] when the -p file cannot be read (from File.read)
    def search(prompt = nil)
      prompt = File.read(options[:prompt]) if options[:prompt]

      # An empty or whitespace-only prompt is as unusable as a missing one.
      if prompt.nil? || prompt.strip.empty?
        puts "Please provide a prompt as a string or with the -p option."
        return
      end

      results = Ragdoll::Search.new(vectorize_prompt(prompt)).search_database(options[:max_count])
      # Keywords are only needed for reranking, so they are extracted on demand.
      results = rerank_results(results, extract_keywords(prompt)) if options[:rerank]

      results.each do |result|
        puts "Source: #{result[:source]}"
        puts "Metadata: #{result[:metadata]}"
        puts "--------------------------------"
      end
    end

    private

    # Orders results by the number of keywords found in their source, highest
    # first. The original position is the tie-breaker, so results with equal
    # scores keep their incoming order instead of being reversed.
    #
    # @param results [Array<Hash>] entries read through +[:source]+
    # @param keywords [Array<String>] downcased keywords
    # @return [Array<Hash>]
    def rerank_results(results, keywords)
      ranked = results.each_with_index.sort_by do |result, position|
        content = result[:source].to_s.downcase # tolerate a missing source
        [-keywords.count { |keyword| content.include?(keyword) }, position]
      end
      ranked.map(&:first)
    end

    # @return [Array<String>] unique downcased words of the prompt
    def extract_keywords(prompt)
      prompt.split.map(&:downcase).uniq
    end

    # Placeholder vectorization: downcased words of the prompt.
    #
    # @return [Array<String>]
    def vectorize_prompt(prompt)
      prompt.split.map(&:downcase)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ragdoll
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dewayne VanHoozer
|
8
|
+
bindir: bin
|
9
|
+
cert_chain: []
|
10
|
+
date: 2025-02-19 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: rails
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '7.1'
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '7.1'
|
26
|
+
description: Under development. Contributors welcome.
|
27
|
+
email:
|
28
|
+
- dvanhoozer@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- README.md
|
34
|
+
- Rakefile
|
35
|
+
- app/models/ragdoll/document.rb
|
36
|
+
- app/models/ragdoll/embedding.rb
|
37
|
+
- config/initializers/ragdoll.rb
|
38
|
+
- config/routes.rb
|
39
|
+
- db/migrate/20250218123456_create_documents.rb
|
40
|
+
- lib/config/database.yml
|
41
|
+
- lib/config/ragdoll.yml
|
42
|
+
- lib/ragdoll.rb
|
43
|
+
- lib/ragdoll/engine.rb
|
44
|
+
- lib/ragdoll/import_job.rb
|
45
|
+
- lib/ragdoll/ingestion.rb
|
46
|
+
- lib/ragdoll/search.rb
|
47
|
+
- lib/ragdoll/version.rb
|
48
|
+
- lib/tasks/import_task.thor
|
49
|
+
- lib/tasks/jobs_task.thor
|
50
|
+
- lib/tasks/ragdoll_tasks.thor
|
51
|
+
- lib/tasks/search_task.thor
|
52
|
+
homepage: https://github.com/MadBomber/ragdoll
|
53
|
+
licenses:
|
54
|
+
- MIT
|
55
|
+
metadata:
|
56
|
+
allowed_push_host: https://rubygems.org
|
57
|
+
homepage_uri: https://github.com/MadBomber/ragdoll
|
58
|
+
source_code_uri: https://github.com/MadBomber/ragdoll
|
59
|
+
changelog_uri: https://github.com/MadBomber/ragdoll/blob/main/CHANGELOG.md
|
60
|
+
rdoc_options: []
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: 3.1.0
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements: []
|
74
|
+
rubygems_version: 3.6.3
|
75
|
+
specification_version: 4
|
76
|
+
summary: Ruby on Rails Engine
|
77
|
+
test_files: []
|