ragdoll 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +75 -0
- data/Rakefile +10 -0
- data/app/models/ragdoll/document.rb +9 -0
- data/app/models/ragdoll/embedding.rb +9 -0
- data/config/initializers/ragdoll.rb +6 -0
- data/config/routes.rb +5 -0
- data/db/migrate/20250218123456_create_documents.rb +20 -0
- data/lib/config/database.yml +28 -0
- data/lib/config/ragdoll.yml +31 -0
- data/lib/ragdoll/engine.rb +16 -0
- data/lib/ragdoll/import_job.rb +15 -0
- data/lib/ragdoll/ingestion.rb +30 -0
- data/lib/ragdoll/search.rb +18 -0
- data/lib/ragdoll/version.rb +7 -0
- data/lib/ragdoll.rb +12 -0
- data/lib/tasks/import_task.thor +32 -0
- data/lib/tasks/jobs_task.thor +40 -0
- data/lib/tasks/ragdoll_tasks.thor +7 -0
- data/lib/tasks/search_task.thor +55 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bea4621e2b802db79d78f8b1d0679cf2f81ed35b91d35683ce0afcb83ddc54e1
|
4
|
+
data.tar.gz: ec12fb975b154f77a42d54fb3c716d523e1b90e4e0122b3576c5aac15a957340
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3702308d3b772dfc0ebf429a26bae0f0378456e9d6c48357b8e2a4cdeb3744e78b43fd610ef308ff6348f1bec28bb37d51bd5e335d78583574b3212c8f544a33
|
7
|
+
data.tar.gz: 9beebfebafe1ed2e815a3042949b68e5f208d8c555a92228eb4098fa99900f07d2985654766e2907de3069f5d10b6cf3e65fb7a3b431ee9db836e6111f1e27f2
|
data/README.md
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# Ragdoll
|
2
|
+
|
3
|
+
Ragdoll is a Rails Engine designed for document ingestion and search. It allows you to import documents, vectorize them, and perform searches using vector representations.
|
4
|
+
|
5
|
+
## Installation as a Rails Engine
|
6
|
+
|
7
|
+
To use Ragdoll as a Rails Engine, add it to your application's Gemfile by running:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
bundle add ragdoll
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
```bash
|
16
|
+
bundle install
|
17
|
+
```
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
```bash
|
22
|
+
gem install ragdoll
|
23
|
+
```
|
24
|
+
|
25
|
+
## Usage as a Rails Engine
|
26
|
+
|
27
|
+
### Importing Documents
|
28
|
+
|
29
|
+
To import documents from a file, glob, or directory, use the following command:
|
30
|
+
|
31
|
+
```bash
|
32
|
+
ragdoll import PATH
|
33
|
+
```
|
34
|
+
|
35
|
+
- `PATH`: The path to the file, glob pattern, or directory to import.
|
36
|
+
- Use the `-r` or `--recursive` option to import files recursively from directories.
|
37
|
+
- Use the `-j` or `--jobs` option to specify the number of concurrent import jobs.
|
38
|
+
|
39
|
+
### Managing Jobs
|
40
|
+
|
41
|
+
To manage import jobs, use the following command:
|
42
|
+
|
43
|
+
```bash
|
44
|
+
ragdoll jobs [JOB_ID]
|
45
|
+
```
|
46
|
+
|
47
|
+
- `JOB_ID`: The ID of a specific job to manage.
|
48
|
+
- Use `--stop`, `--pause`, or `--resume` to control a specific job.
|
49
|
+
- Use `--stop-all`, `--pause-all`, or `--resume-all` to control all jobs.
|
50
|
+
|
51
|
+
### Searching Documents
|
52
|
+
|
53
|
+
To search the database with a prompt, use the following command:
|
54
|
+
|
55
|
+
```bash
|
56
|
+
ragdoll search PROMPT
|
57
|
+
```
|
58
|
+
|
59
|
+
- `PROMPT`: The search prompt as a string. Alternatively, use the `-p` option to specify a file containing the prompt text.
|
60
|
+
- Use the `--max_count` option to specify the maximum number of results to return.
|
61
|
+
- Use the `--rerank` option to rerank results using keyword search.
|
62
|
+
|
63
|
+
## Development and Contribution
|
64
|
+
|
65
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
|
+
|
67
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
|
+
|
69
|
+
## Contributing
|
70
|
+
|
71
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/MadBomber/ragdoll.
|
72
|
+
|
73
|
+
## License
|
74
|
+
|
75
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/config/routes.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# This migration creates the documents table with necessary extensions for PostgreSQL.
|
2
|
+
|
3
|
+
# Creates the `documents` table and enables the PostgreSQL extensions the
# gem relies on for fuzzy text matching.
#
# The class is deliberately defined at the top level: the Rails migrator
# derives the constant to instantiate from the migration file name
# (`..._create_documents.rb` -> `CreateDocuments`), so a class nested inside
# `module Ragdoll` would not be found when the migration runs.
class CreateDocuments < ActiveRecord::Migration[7.0]
  def change
    enable_extension "pg_trgm"
    enable_extension "fuzzystrmatch"

    # NOTE(review): an engine using `isolate_namespace Ragdoll` normally maps
    # Ragdoll::Document to `ragdoll_documents` — confirm the model sets its
    # table name to match `documents`.
    create_table :documents do |t|
      t.string :location
      t.string :summary
      # NOTE(review): `type` is Active Record's default single-table-inheritance
      # column — confirm this is intended, or that the model overrides
      # `inheritance_column`.
      t.string :type
      t.datetime :processing_started_at
      t.datetime :processing_finished_at

      t.timestamps
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This file contains the database configuration for the Ragdoll gem, using environment variables.
|
2
|
+
|
3
|
+
default: &default
|
4
|
+
adapter: postgresql
|
5
|
+
encoding: unicode
|
6
|
+
pool: <%= ENV.fetch("RAGDOLL_POOL", 5) %>
|
7
|
+
timeout: <%= ENV.fetch("RAGDOLL_TIMEOUT", 5000) %>
|
8
|
+
|
9
|
+
development:
|
10
|
+
<<: *default
|
11
|
+
host: <%= ENV.fetch("RAGDOLL_HOST", "localhost") %>
|
12
|
+
database: <%= ENV.fetch("RAGDOLL_DATABASE", "ragdoll_development") %>
|
13
|
+
username: <%= ENV.fetch("RAGDOLL_USER", "user") %>
|
14
|
+
password: <%= ENV.fetch("RAGDOLL_PASSWORD", "password") %>
|
15
|
+
|
16
|
+
test:
|
17
|
+
<<: *default
|
18
|
+
host: <%= ENV.fetch("RAGDOLL_HOST", "localhost") %>
|
19
|
+
database: <%= ENV.fetch("RAGDOLL_DATABASE", "ragdoll_test") %>
|
20
|
+
username: <%= ENV.fetch("RAGDOLL_USER", "user") %>
|
21
|
+
password: <%= ENV.fetch("RAGDOLL_PASSWORD", "password") %>
|
22
|
+
|
23
|
+
production:
|
24
|
+
<<: *default
|
25
|
+
host: <%= ENV.fetch("RAGDOLL_HOST") %>
|
26
|
+
database: <%= ENV.fetch("RAGDOLL_DATABASE") %>
|
27
|
+
username: <%= ENV.fetch("RAGDOLL_USER") %>
|
28
|
+
password: <%= ENV.fetch("RAGDOLL_PASSWORD") %>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# This file contains the default configuration settings for the Ragdoll gem, including database configurations.
|
2
|
+
|
3
|
+
# Shared settings for every environment.
#
# The nested `database` mapping carries its own anchor because YAML merge keys
# (`<<`) are shallow: an environment that redefines `database:` replaces the
# whole mapping instead of overriding single keys. Environments below therefore
# merge `*default_database` explicitly and override only what differs.
default: &default
  database: &default_database
    host: localhost
    database: ragdoll_development
    user: user
    password: password
    pool: 5
    timeout: 5000

  llm:
    embeddings_model: "llama-2-7b"
    reranking_model: "llama-2-13b"
    chat_model: "llama-2-70b"

development:
  <<: *default

test:
  <<: *default
  database:
    <<: *default_database
    database: ragdoll_test

production:
  <<: *default
  database:
    # Connection details must come from the environment; pool and timeout are
    # inherited from the defaults.
    <<: *default_database
    host: <%= ENV.fetch("RAGDOLL_HOST") %>
    database: <%= ENV.fetch("RAGDOLL_DATABASE") %>
    user: <%= ENV.fetch("RAGDOLL_USER") %>
    password: <%= ENV.fetch("RAGDOLL_PASSWORD") %>
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# This file defines the Ragdoll engine, which integrates the gem with Rails applications.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require "rails/engine"
|
6
|
+
|
7
|
+
module Ragdoll
  # Rails engine entry point for the gem.
  class Engine < ::Rails::Engine
    isolate_namespace Ragdoll

    # Generator defaults used when running Rails generators inside the engine:
    # Minitest for tests, FactoryBot factories stored under test/factories.
    config.generators do |generators|
      generators.test_framework :minitest
      generators.fixture_replacement :factory_bot
      generators.factory_bot dir: "test/factories"
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# This file defines the ImportJob class for handling document import tasks in the background.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module Ragdoll
  # Background job that reads one file from disk and runs it through
  # Ragdoll::Ingestion.
  #
  # Inherits from ActiveJob::Base: Solid Queue is an Active Job queue backend
  # and does not provide a job base class of its own.
  class ImportJob < ActiveJob::Base
    # @param file [String] path of the file to import
    # @return [void]
    def perform(file)
      document = File.read(file)
      ingestion = Ragdoll::Ingestion.new(document)

      # NOTE(review): the vectorized chunks are discarded here because
      # `store_in_database` takes no arguments — confirm how the two steps are
      # meant to share data once storage is implemented.
      ingestion.chunk_and_vectorize
      ingestion.store_in_database

      puts "Imported #{file} successfully."
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# This file contains the Ingestion class responsible for processing documents by chunking and vectorizing them.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module Ragdoll
  # Splits a document into paragraph-sized chunks and converts each chunk into
  # a placeholder "vector" (its downcased words).
  class Ingestion
    # Paragraphs are separated by two or more consecutive line breaks, in
    # either LF or CRLF form.
    PARAGRAPH_BREAK = /(?:\r?\n){2,}/

    # @param document [String] full text of the document to ingest
    def initialize(document)
      @document = document
    end

    # Chunks the document by paragraph and vectorizes every non-blank chunk.
    #
    # @return [Array<Array<String>>] one word list per paragraph; empty when
    #   the document is empty or contains only whitespace
    def chunk_and_vectorize
      @document.to_s
               .split(PARAGRAPH_BREAK)
               .reject { |chunk| chunk.strip.empty? } # extra blank lines are not paragraphs
               .map { |chunk| vectorize(chunk) }
    end

    # Persists the vectorized data. Not implemented yet.
    #
    # @return [nil]
    def store_in_database
      # Implement logic to store vectorized data in the database
    end

    private

    # Placeholder vectorization: splits on whitespace and downcases each word.
    #
    # @param chunk [String]
    # @return [Array<String>]
    def vectorize(chunk)
      chunk.split.map(&:downcase)
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# This file contains the Search class responsible for querying the database with a prompt.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module Ragdoll
  # Looks up stored entries that match a prompt. The data source is still a
  # placeholder, so searches currently return no results.
  class Search
    # @param prompt [Object] value each candidate entry is tested against
    #   with `include?`
    def initialize(prompt)
      @prompt = prompt
    end

    # Returns the entries that include the prompt, limited to `max_count`.
    #
    # @param max_count [Integer, nil] maximum number of results; nil means
    #   no limit
    # @return [Array] matching entries, in source order
    def search_database(max_count)
      matches = candidate_entries.select { |entry| entry.include?(@prompt) }
      max_count ? matches.first(max_count) : matches
    end

    private

    # Entries to search through.
    # Placeholder for actual database query results.
    #
    # @return [Array]
    def candidate_entries
      []
    end
  end
end
|
data/lib/ragdoll.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# This file is the main entry point for the Ragdoll gem, requiring all necessary components.
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
# frozen_string_literal: true
|
6
|
+
|
7
|
+
require "ragdoll/version"
|
8
|
+
require "ragdoll/engine"
|
9
|
+
|
10
|
+
module Ragdoll
  # Base class for errors raised by the Ragdoll gem.
  class Error < StandardError
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require_relative '../ragdoll/import_job'
|
5
|
+
|
6
|
+
module Ragdoll
  # Thor command that enqueues one Ragdoll::ImportJob per file found at PATH.
  class ImportTask < Thor
    desc "import PATH", "Import documents from a file, glob, or directory"
    method_option :recursive, aliases: "-r", type: :boolean, default: false, desc: "Recursively import files from directories"
    # NOTE(review): Solid Queue configures concurrency on the worker side, so
    # this value is accepted for compatibility but not applied at enqueue
    # time — confirm the intended design.
    method_option :jobs, aliases: "-j", type: :numeric, default: 1, desc: "Number of concurrent import jobs"
    # Enqueues an import job for every regular file matched by +path+.
    #
    # @param path [String] a file path, a glob pattern, or a directory
    # @return [void]
    def import(path)
      candidate_paths(path).each do |file|
        next unless File.file?(file) # skip directories and other non-files

        Ragdoll::ImportJob.perform_later(file)
      end
    end

    private

    # Expands +path+ into the list of paths to consider for import.
    #
    # A directory lists its entries (recursively with --recursive), an
    # existing file is returned as-is, and anything else is treated as a glob
    # pattern.
    #
    # @param path [String]
    # @return [Array<String>]
    def candidate_paths(path)
      if File.directory?(path)
        pattern = options[:recursive] ? "**/*" : "*"
        Dir.glob(File.join(path, pattern))
      elsif File.file?(path)
        [path] # literal file, even if its name contains glob characters
      else
        Dir.glob(path)
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
|
5
|
+
module Ragdoll
  # Thor command for reporting on and controlling import jobs. Every branch
  # currently only prints a message; the status listing is placeholder output.
  class JobsTask < Thor
    desc "jobs [JOB_ID]", "Report the status of all running and queued import jobs, or a specific job if JOB_ID is provided"
    method_option :stop_all, type: :boolean, default: false, desc: "Stop all running and queued jobs"
    method_option :pause_all, type: :boolean, default: false, desc: "Pause all running jobs"
    method_option :resume_all, type: :boolean, default: false, desc: "Resume all paused jobs"
    method_option :stop, type: :boolean, default: false, desc: "Stop a specific job"
    method_option :pause, type: :boolean, default: false, desc: "Pause a specific job"
    method_option :resume, type: :boolean, default: false, desc: "Resume a specific job"
    # With a JOB_ID, acts on that job using --stop / --pause / --resume;
    # without one, acts on all jobs using the --*-all flags. When several
    # flags are given, stop wins over pause, which wins over resume.
    #
    # @param job_id [String, nil] ID of a specific job, if any
    # @return [nil]
    def jobs(job_id = nil)
      # First flag (in priority order) that was set on the command line.
      requested = ->(*flags) { flags.find { |flag| options[flag] } }

      if job_id
        case requested.call(:stop, :pause, :resume)
        when :stop   then puts "Stopping job ID: #{job_id}..."
        when :pause  then puts "Pausing job ID: #{job_id}..."
        when :resume then puts "Resuming job ID: #{job_id}..."
        else              puts "Fetching status for job ID: #{job_id}..."
        end
        return
      end

      case requested.call(:stop_all, :pause_all, :resume_all)
      when :stop_all   then puts "Stopping all jobs..."
      when :pause_all  then puts "Pausing all running jobs..."
      when :resume_all then puts "Resuming all paused jobs..."
      else
        puts "Fetching status of all running and queued import jobs..."
        puts "Job ID: 12345, Status: Running, File: document1.txt"
        puts "Job ID: 12346, Status: Running, File: document2.txt"
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require_relative '../ragdoll/search'
|
5
|
+
|
6
|
+
module Ragdoll
  # Thor command that runs a prompt through Ragdoll::Search and prints the
  # results.
  class SearchTask < Thor
    desc "search PROMPT", "Search the database with a prompt"
    method_option :prompt, aliases: "-p", type: :string, desc: "File path containing the prompt text"
    method_option :max_count, type: :numeric, default: 10, desc: "Maximum number of results to return"
    method_option :rerank, type: :boolean, default: false, desc: "Rerank results using keyword search"
    # Searches with the given prompt, or with the contents of the file passed
    # via -p/--prompt (which takes precedence over the positional argument).
    #
    # @param prompt [String, nil] prompt text
    # @return [void]
    def search(prompt = nil)
      prompt = File.read(options[:prompt]) if options[:prompt]

      # A blank prompt (e.g. an empty file) is as unusable as a missing one.
      if prompt.nil? || prompt.strip.empty?
        puts "Please provide a prompt as a string or with the -p option."
        return
      end

      keywords = extract_keywords(prompt)
      # NOTE(review): Search tests entries with `include?(prompt)`, while this
      # passes an array of words and later reads results as hashes — confirm
      # the contract between the two classes.
      searcher = Ragdoll::Search.new(vectorize_prompt(prompt))
      results = searcher.search_database(options[:max_count])
      results = rerank_results(results, keywords) if options[:rerank]

      results.each do |result|
        puts "Source: #{result[:source]}"
        puts "Metadata: #{result[:metadata]}"
        puts "--------------------------------"
      end
    end

    private

    # Orders results by how many keywords their source contains, best first.
    # Ties keep their original relative order; a missing source counts as
    # zero matches.
    #
    # @param results [Array<Hash>] entries with a :source value
    # @param keywords [Array<String>] downcased keywords
    # @return [Array<Hash>]
    def rerank_results(results, keywords)
      ranked = results.each_with_index.sort_by do |result, position|
        content = result[:source].to_s.downcase
        [-keywords.count { |keyword| content.include?(keyword) }, position]
      end
      ranked.map(&:first)
    end

    # @param prompt [String]
    # @return [Array<String>] unique downcased words of the prompt
    def extract_keywords(prompt)
      prompt.split.map(&:downcase).uniq
    end

    # Placeholder vectorization: the downcased words of the prompt.
    #
    # @param prompt [String]
    # @return [Array<String>]
    def vectorize_prompt(prompt)
      prompt.split.map(&:downcase)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ragdoll
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dewayne VanHoozer
|
8
|
+
bindir: bin
|
9
|
+
cert_chain: []
|
10
|
+
date: 2025-02-19 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: rails
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '7.1'
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '7.1'
|
26
|
+
description: Under development. Contributors welcome.
|
27
|
+
email:
|
28
|
+
- dvanhoozer@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- README.md
|
34
|
+
- Rakefile
|
35
|
+
- app/models/ragdoll/document.rb
|
36
|
+
- app/models/ragdoll/embedding.rb
|
37
|
+
- config/initializers/ragdoll.rb
|
38
|
+
- config/routes.rb
|
39
|
+
- db/migrate/20250218123456_create_documents.rb
|
40
|
+
- lib/config/database.yml
|
41
|
+
- lib/config/ragdoll.yml
|
42
|
+
- lib/ragdoll.rb
|
43
|
+
- lib/ragdoll/engine.rb
|
44
|
+
- lib/ragdoll/import_job.rb
|
45
|
+
- lib/ragdoll/ingestion.rb
|
46
|
+
- lib/ragdoll/search.rb
|
47
|
+
- lib/ragdoll/version.rb
|
48
|
+
- lib/tasks/import_task.thor
|
49
|
+
- lib/tasks/jobs_task.thor
|
50
|
+
- lib/tasks/ragdoll_tasks.thor
|
51
|
+
- lib/tasks/search_task.thor
|
52
|
+
homepage: https://github.com/MadBomber/ragdoll
|
53
|
+
licenses:
|
54
|
+
- MIT
|
55
|
+
metadata:
|
56
|
+
allowed_push_host: https://rubygems.org
|
57
|
+
homepage_uri: https://github.com/MadBomber/ragdoll
|
58
|
+
source_code_uri: https://github.com/MadBomber/ragdoll
|
59
|
+
changelog_uri: https://github.com/MadBomber/ragdoll/blob/main/CHANGELOG.md
|
60
|
+
rdoc_options: []
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: 3.1.0
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements: []
|
74
|
+
rubygems_version: 3.6.3
|
75
|
+
specification_version: 4
|
76
|
+
summary: Ruby on Rails Engine
|
77
|
+
test_files: []
|