text_splitters 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7be4926126e20d571d6cdc3b3dd746b40e4831b0f57a618b5af24d9ac3da9ee9
4
+ data.tar.gz: 9a1e953226c677f60bb9485c5429ac25f359f82b3ff49e6a8ed18de9f3444ceb
5
+ SHA512:
6
+ metadata.gz: 25d6c946a4cf2e805b78a04b69a8700b5a89ada0be2802b90b876151831466b4241e17b34286d6eb9b5c9d766b7864bc1b81a9836ccdbd6a0bda2ee14e00a542
7
+ data.tar.gz: e60237fc48cbe8d399c201a99c0208af2ade1430bfb1243e75dcedcea2e6cd9f6d06639e84f90a7dfd368c8fbc315b90f76ca0ffc3face95575494c1308bae8e
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Alex Ghiculescu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,59 @@
1
+ # text_splitters
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/text_splitters.svg)](https://rubygems.org/gems/text_splitters)
4
+ [![CI](https://github.com/ghiculescu/text_splitters/actions/workflows/ci.yml/badge.svg)](https://github.com/ghiculescu/text_splitters/actions/workflows/ci.yml)
5
+ [![Code Climate](https://codeclimate.com/github/ghiculescu/text_splitters/badges/gpa.svg)](https://codeclimate.com/github/ghiculescu/text_splitters)
6
+
7
+ Port of [langchain](https://github.com/hwchase17/langchain) text splitters to Ruby.
8
+
9
+ So far only the `RecursiveCharacterTextSplitter` is implemented. PRs for others are welcome!
10
+
11
+ ---
12
+
13
+ - [Quick start](#quick-start)
14
+ - [Usage](#usage)
+ - [Support](#support)
15
+ - [License](#license)
16
+ - [Code of conduct](#code-of-conduct)
17
+ - [Contribution guide](#contribution-guide)
18
+
19
+ ## Quick start
20
+
21
+ ```
22
+ $ gem install text_splitters
23
+ ```
24
+
25
+ ```ruby
26
+ require "text_splitters"
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ### `RecursiveCharacterTextSplitter`
32
+
33
+ [Learn more about this splitter](https://langchain.readthedocs.io/en/latest/modules/indexes/examples/textsplitter.html#generic-recursive-text-splitting).
34
+
35
+ ```ruby
36
+ text = "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans."
37
+ splitter = ::TextSplitters::RecursiveCharacterTextSplitter.new(chunk_size: 100, chunk_overlap: 20)
38
+
39
+ output = splitter.split(text)
40
+
41
+ output[0] # "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet."
42
+ output[1] # "and the Cabinet. Justices of the Supreme Court. My fellow Americans."
43
+ ```
44
+
45
+ ## Support
46
+
47
+ If you want to report a bug, or have ideas, feedback or questions about the gem, [let me know via GitHub issues](https://github.com/ghiculescu/text_splitters/issues/new) and I will do my best to provide a helpful answer. Happy hacking!
48
+
49
+ ## License
50
+
51
+ The gem is available as open source under the terms of the [MIT License](LICENSE.txt).
52
+
53
+ ## Code of conduct
54
+
55
+ Everyone interacting in this project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
56
+
57
+ ## Contribution guide
58
+
59
+ Pull requests are welcome!
@@ -0,0 +1,72 @@
1
module TextSplitters
  # Ruby port of https://github.com/hwchase17/langchain/blob/763f87953686a69897d1f4d2260388b88eb8d670/langchain/text_splitter.py#L221
  #
  # Splits text on the coarsest separator it contains (blank line, newline,
  # space, then individual characters), recursing into any piece that is still
  # too long, and then merges neighbouring small pieces back into chunks.
  class RecursiveCharacterTextSplitter
    # @param chunk_size [Integer] target chunk length, measured in characters
    #   of the pieces only (separators are not counted, see #merge_splits)
    # @param chunk_overlap [Integer] approximate number of trailing characters
    #   of one chunk that are repeated at the start of the next chunk
    def initialize(chunk_size:, chunk_overlap:)
      @chunk_size = chunk_size
      @chunk_overlap = chunk_overlap
      # Tried in order, coarsest first; "" splits into single characters.
      @separators = ["\n\n", "\n", " ", ""]
    end

    # Splits +text+ into chunks.
    #
    # @param text [String] the text to split
    # @return [Array<String>] the chunks, in document order
    def split(text)
      output = []
      good_splits = []

      # Every string includes "", so the lookup always finds a separator; the
      # fallback only mirrors the original default.
      separator = @separators.find { |candidate| text.include?(candidate) } || @separators.last

      text.split(separator).each do |piece|
        if piece.length < @chunk_size
          good_splits << piece
          next
        end

        # An oversized piece ends the current run of small pieces: flush them
        # first so the output stays in document order.
        if good_splits.any?
          output.concat(merge_splits(good_splits, separator))
          good_splits = []
        end

        if piece == text
          # Splitting made no progress (a single character with
          # chunk_size <= 1). Recursing on the same string would never
          # terminate, so emit the piece as its own chunk instead.
          leftover = piece.strip
          output << leftover unless leftover.empty?
        else
          # The piece is strictly shorter than text, so recursion terminates.
          output.concat(split(piece))
        end
      end

      output.concat(merge_splits(good_splits, separator)) if good_splits.any?

      output
    end

    private

    # Joins consecutive small pieces with +separator+ into chunks.
    #
    # The running total counts only the piece lengths, not the separators put
    # back by the join, so a returned chunk can be longer than @chunk_size.
    #
    # @param splits [Array<String>] pieces, each shorter than @chunk_size
    # @param separator [String] the separator the pieces were split on
    # @return [Array<String>] stripped, non-empty chunks
    def merge_splits(splits, separator)
      output = []
      current_doc = []
      total = 0

      splits.each do |split|
        if total + split.length >= @chunk_size && current_doc.any?
          doc = current_doc.join(separator).strip
          output << doc unless doc.empty?

          # Drop pieces from the front until what remains fits in the overlap
          # budget and leaves room for the incoming piece.
          while total > @chunk_overlap || (total > 0 && (total + split.length > @chunk_size))
            total -= current_doc.first.length
            current_doc.shift
          end
        end
        current_doc << split
        total += split.length
      end

      doc = current_doc.join(separator).strip
      output << doc unless doc.empty?

      output
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace module of the text_splitters gem; this file only defines the version.
+ module TextSplitters
2
# Version string of the gem, frozen so it cannot be mutated at runtime.
+ VERSION = "1.0.0".freeze
3
+ end
@@ -0,0 +1,5 @@
1
# Entry point of the gem. Constants are registered with autoload, so each file
# is only required the first time its constant is referenced.
+ module TextSplitters
2
+ autoload :VERSION, "text_splitters/version"
3
+
4
# The only splitter registered here.
+ autoload :RecursiveCharacterTextSplitter, "text_splitters/recursive_character_text_splitter"
5
+ end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_splitters
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Alex Ghiculescu
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-03-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email:
15
+ - alex@tanda.co
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - LICENSE.txt
21
+ - README.md
22
+ - lib/text_splitters.rb
23
+ - lib/text_splitters/recursive_character_text_splitter.rb
24
+ - lib/text_splitters/version.rb
25
+ homepage: https://github.com/ghiculescu/text_splitters
26
+ licenses:
27
+ - MIT
28
+ metadata:
29
+ bug_tracker_uri: https://github.com/ghiculescu/text_splitters/issues
30
+ changelog_uri: https://github.com/ghiculescu/text_splitters/releases
31
+ source_code_uri: https://github.com/ghiculescu/text_splitters
32
+ homepage_uri: https://github.com/ghiculescu/text_splitters
33
+ rubygems_mfa_required: 'true'
34
+ post_install_message:
35
+ rdoc_options: []
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '2.6'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubygems_version: 3.3.26
50
+ signing_key:
51
+ specification_version: 4
52
+ summary: Port of langchain's text splitters
53
+ test_files: []