text_splitters 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +59 -0
- data/lib/text_splitters/recursive_character_text_splitter.rb +72 -0
- data/lib/text_splitters/version.rb +3 -0
- data/lib/text_splitters.rb +5 -0
- metadata +53 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7be4926126e20d571d6cdc3b3dd746b40e4831b0f57a618b5af24d9ac3da9ee9
|
4
|
+
data.tar.gz: 9a1e953226c677f60bb9485c5429ac25f359f82b3ff49e6a8ed18de9f3444ceb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 25d6c946a4cf2e805b78a04b69a8700b5a89ada0be2802b90b876151831466b4241e17b34286d6eb9b5c9d766b7864bc1b81a9836ccdbd6a0bda2ee14e00a542
|
7
|
+
data.tar.gz: e60237fc48cbe8d399c201a99c0208af2ade1430bfb1243e75dcedcea2e6cd9f6d06639e84f90a7dfd368c8fbc315b90f76ca0ffc3face95575494c1308bae8e
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2023 Alex Ghiculescu
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# text_splitters
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/text_splitters.svg)](https://rubygems.org/gems/text_splitters)
|
4
|
+
[![CI](https://github.com/ghiculescu/text_splitters/actions/workflows/ci.yml/badge.svg)](https://github.com/ghiculescu/text_splitters/actions/workflows/ci.yml)
|
5
|
+
[![Code Climate](https://codeclimate.com/github/ghiculescu/text_splitters/badges/gpa.svg)](https://codeclimate.com/github/ghiculescu/text_splitters)
|
6
|
+
|
7
|
+
Port of [langchain](https://github.com/hwchase17/langchain) text splitters to Ruby.
|
8
|
+
|
9
|
+
So far only the `RecursiveCharacterTextSplitter` is implemented. PRs for others are welcome!
|
10
|
+
|
11
|
+
---
|
12
|
+
|
13
|
+
- [Quick start](#quick-start)
|
14
|
+
- [Support](#support)
|
15
|
+
- [License](#license)
|
16
|
+
- [Code of conduct](#code-of-conduct)
|
17
|
+
- [Contribution guide](#contribution-guide)
|
18
|
+
|
19
|
+
## Quick start
|
20
|
+
|
21
|
+
```
|
22
|
+
$ gem install text_splitters
|
23
|
+
```
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
require "text_splitters"
|
27
|
+
```
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
### `RecursiveCharacterTextSplitter`
|
32
|
+
|
33
|
+
[Learn more about this splitter](https://langchain.readthedocs.io/en/latest/modules/indexes/examples/textsplitter.html#generic-recursive-text-splitting).
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
text = "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans."
|
37
|
+
splitter = ::TextSplitters::RecursiveCharacterTextSplitter.new(chunk_size: 100, chunk_overlap: 20)
|
38
|
+
|
39
|
+
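output = splitter.split(text) # chunk_size counts only the pieces, not the separators re-inserted between them, so a chunk can be slightly longer than chunk_size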
|
40
|
+
|
41
|
+
output[0] # "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet."
|
42
|
+
output[1] # "and the Cabinet. Justices of the Supreme Court. My fellow Americans."
|
43
|
+
```
|
44
|
+
|
45
|
+
## Support
|
46
|
+
|
47
|
+
If you want to report a bug, or have ideas, feedback or questions about the gem, [let me know via GitHub issues](https://github.com/ghiculescu/text_splitters/issues/new) and I will do my best to provide a helpful answer. Happy hacking!
|
48
|
+
|
49
|
+
## License
|
50
|
+
|
51
|
+
The gem is available as open source under the terms of the [MIT License](LICENSE.txt).
|
52
|
+
|
53
|
+
## Code of conduct
|
54
|
+
|
55
|
+
Everyone interacting in this project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
|
56
|
+
|
57
|
+
## Contribution guide
|
58
|
+
|
59
|
+
Pull requests are welcome!
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module TextSplitters
  # Splits a string into chunks by trying progressively finer separators
  # (blank line, newline, space, then individual characters) and merging the
  # resulting pieces back together into chunks of roughly +chunk_size+
  # characters, with up to roughly +chunk_overlap+ characters repeated
  # between consecutive chunks.
  #
  # Ruby port of https://github.com/hwchase17/langchain/blob/763f87953686a69897d1f4d2260388b88eb8d670/langchain/text_splitter.py#L221
  class RecursiveCharacterTextSplitter
    # @param chunk_size [Integer] target chunk length. Only the lengths of the
    #   pieces are counted while merging, not the separators re-inserted
    #   between them, so an emitted chunk may be longer than this value.
    # @param chunk_overlap [Integer] once a chunk is emitted, leading pieces
    #   are dropped until at most this many characters (again excluding
    #   separators) remain to start the next chunk.
    def initialize(chunk_size:, chunk_overlap:)
      @chunk_size = chunk_size
      @chunk_overlap = chunk_overlap
      # Ordered from coarsest to finest; "" is contained in every string, so
      # it acts as the final fallback (split into characters).
      @separators = ["\n\n", "\n", " ", ""]
    end

    # Splits +text+ into an array of chunk strings.
    #
    # @param text [String] the text to split
    # @return [Array<String>] the chunks, each stripped of surrounding
    #   whitespace; blank chunks are omitted
    def split(text)
      output = []
      good_splits = []

      # Use the first (coarsest) separator that actually occurs in the text.
      separator = @separators.find { |s| text.include?(s) } || @separators.last
      # NOTE: String#split with a single-space pattern splits on runs of
      # whitespace rather than on each literal space.
      splits = text.split(separator)

      splits.each do |s|
        if s.length < @chunk_size
          good_splits << s
          next
        end

        # An oversized piece interrupts the run of small pieces: flush what
        # has been collected so far to keep the output in document order.
        if good_splits.any?
          output.concat(merge_splits(good_splits, separator))
          good_splits = []
        end

        if s.length <= 1
          # A piece this short cannot be split any further. Recursing would
          # never make progress (split("a") would call split("a") forever
          # when chunk_size <= 1), so emit it directly, applying the same
          # strip/skip-blank rule that merge_splits applies.
          piece = s.strip
          output << piece unless piece.empty?
        else
          # Re-split the oversized piece with the finer separators.
          output.concat(split(s))
        end
      end

      output.concat(merge_splits(good_splits, separator)) if good_splits.any?

      output
    end

    private

    # Greedily joins small pieces into chunks.
    #
    # @param splits [Array<String>] pieces, each shorter than the chunk size
    # @param separator [String] the string used to re-join pieces
    # @return [Array<String>] merged, stripped, non-blank chunks
    def merge_splits(splits, separator)
      output = []
      current_doc = []
      total = 0 # sum of the piece lengths in current_doc (separators excluded)

      splits.each do |split|
        if total + split.length >= @chunk_size && current_doc.any?
          doc = current_doc.join(separator).strip
          output << doc unless doc.empty?

          # Drop leading pieces until what remains fits in the overlap
          # budget and leaves room for the incoming piece.
          while total > @chunk_overlap || (total > 0 && (total + split.length > @chunk_size))
            total -= current_doc.first.length
            current_doc.shift
          end
        end

        current_doc << split
        total += split.length
      end

      doc = current_doc.join(separator).strip
      output << doc unless doc.empty?

      output
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_splitters
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alex Ghiculescu
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-03-05 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email:
|
15
|
+
- alex@tanda.co
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- LICENSE.txt
|
21
|
+
- README.md
|
22
|
+
- lib/text_splitters.rb
|
23
|
+
- lib/text_splitters/recursive_character_text_splitter.rb
|
24
|
+
- lib/text_splitters/version.rb
|
25
|
+
homepage: https://github.com/ghiculescu/text_splitters
|
26
|
+
licenses:
|
27
|
+
- MIT
|
28
|
+
metadata:
|
29
|
+
bug_tracker_uri: https://github.com/ghiculescu/text_splitters/issues
|
30
|
+
changelog_uri: https://github.com/ghiculescu/text_splitters/releases
|
31
|
+
source_code_uri: https://github.com/ghiculescu/text_splitters
|
32
|
+
homepage_uri: https://github.com/ghiculescu/text_splitters
|
33
|
+
rubygems_mfa_required: 'true'
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options: []
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '2.6'
|
43
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
requirements: []
|
49
|
+
rubygems_version: 3.3.26
|
50
|
+
signing_key:
|
51
|
+
specification_version: 4
|
52
|
+
summary: Port of langchain's text splitters
|
53
|
+
test_files: []
|