text_splitters 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7be4926126e20d571d6cdc3b3dd746b40e4831b0f57a618b5af24d9ac3da9ee9
4
+ data.tar.gz: 9a1e953226c677f60bb9485c5429ac25f359f82b3ff49e6a8ed18de9f3444ceb
5
+ SHA512:
6
+ metadata.gz: 25d6c946a4cf2e805b78a04b69a8700b5a89ada0be2802b90b876151831466b4241e17b34286d6eb9b5c9d766b7864bc1b81a9836ccdbd6a0bda2ee14e00a542
7
+ data.tar.gz: e60237fc48cbe8d399c201a99c0208af2ade1430bfb1243e75dcedcea2e6cd9f6d06639e84f90a7dfd368c8fbc315b90f76ca0ffc3face95575494c1308bae8e
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Alex Ghiculescu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,59 @@
1
+ # text_splitters
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/text_splitters.svg)](https://rubygems.org/gems/text_splitters)
4
+ [![CI](https://github.com/ghiculescu/text_splitters/actions/workflows/ci.yml/badge.svg)](https://github.com/ghiculescu/text_splitters/actions/workflows/ci.yml)
5
+ [![Code Climate](https://codeclimate.com/github/ghiculescu/text_splitters/badges/gpa.svg)](https://codeclimate.com/github/ghiculescu/text_splitters)
6
+
7
+ Port of [langchain](https://github.com/hwchase17/langchain) text splitters to Ruby.
8
+
9
+ So far only the `RecursiveCharacterTextSplitter` is implemented. PRs for others are welcome!
10
+
11
+ ---
12
+
13
+ - [Quick start](#quick-start)
+ - [Usage](#usage)
14
+ - [Support](#support)
15
+ - [License](#license)
16
+ - [Code of conduct](#code-of-conduct)
17
+ - [Contribution guide](#contribution-guide)
18
+
19
+ ## Quick start
20
+
21
+ ```
22
+ $ gem install text_splitters
23
+ ```
24
+
25
+ ```ruby
26
+ require "text_splitters"
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ### `RecursiveCharacterTextSplitter`
32
+
33
+ [Learn more about this splitter](https://langchain.readthedocs.io/en/latest/modules/indexes/examples/textsplitter.html#generic-recursive-text-splitting).
34
+
35
+ ```ruby
36
+ text = "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans."
37
+ splitter = ::TextSplitters::RecursiveCharacterTextSplitter.new(chunk_size: 100, chunk_overlap: 20)
38
+
39
+ output = splitter.split(text)
40
+
41
+ output[0] # "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet."
42
+ output[1] # "and the Cabinet. Justices of the Supreme Court. My fellow Americans."
43
+ ```
44
+
45
+ ## Support
46
+
47
+ If you want to report a bug, or have ideas, feedback or questions about the gem, [let me know via GitHub issues](https://github.com/ghiculescu/text_splitters/issues/new) and I will do my best to provide a helpful answer. Happy hacking!
48
+
49
+ ## License
50
+
51
+ The gem is available as open source under the terms of the [MIT License](LICENSE.txt).
52
+
53
+ ## Code of conduct
54
+
55
+ Everyone interacting in this project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
56
+
57
+ ## Contribution guide
58
+
59
+ Pull requests are welcome!
@@ -0,0 +1,72 @@
1
module TextSplitters
  # Ruby port of https://github.com/hwchase17/langchain/blob/763f87953686a69897d1f4d2260388b88eb8d670/langchain/text_splitter.py#L221
  #
  # Splits a string into chunks by trying progressively finer separators
  # (blank line, newline, space, then individual characters) and merging
  # the resulting pieces back together into chunks of roughly +chunk_size+
  # characters, carrying up to +chunk_overlap+ characters of trailing
  # pieces over into the next chunk.
  #
  # Lengths are measured with String#length. Separator characters that are
  # re-inserted when pieces are joined are NOT counted towards the running
  # total, so a joined chunk can be somewhat longer than +chunk_size+.
  class RecursiveCharacterTextSplitter
    # @param chunk_size [Integer] target upper bound for the summed length of
    #   the pieces that make up one chunk
    # @param chunk_overlap [Integer] summed piece length above which trailing
    #   pieces are dropped before starting the next chunk
    def initialize(chunk_size:, chunk_overlap:)
      @chunk_size = chunk_size
      @chunk_overlap = chunk_overlap
      @separators = ["\n\n", "\n", " ", ""]
    end

    # Splits +text+ into an array of chunk strings.
    #
    # The first separator that occurs in +text+ is used to cut it into
    # pieces. Pieces shorter than +chunk_size+ are accumulated and merged;
    # a piece that is still too long is split again with the finer
    # separators.
    #
    # @param text [String] the text to split
    # @return [Array<String>] the chunks, in order; empty for empty input
    def split(text)
      output = []
      good_splits = []

      # "" is contained in every string, so +find+ always succeeds; the
      # fallback only mirrors the original default.
      separator = @separators.find { |candidate| text.include?(candidate) } || @separators.last

      # NOTE: String#split treats a single-space separator specially: it
      # splits on runs of whitespace and ignores leading whitespace.
      text.split(separator).each do |piece|
        if piece.length < @chunk_size
          good_splits << piece
          next
        end

        # An oversized piece ends the current run of small pieces.
        unless good_splits.empty?
          output.concat(merge_splits(good_splits, separator))
          good_splits = []
        end

        if separator.empty?
          # We are already at character granularity (only reachable when
          # chunk_size <= 1). Recursing would pick "" again and never
          # terminate, so emit the character as its own chunk instead.
          stripped = piece.strip
          output << stripped unless stripped.empty?
        else
          output.concat(split(piece))
        end
      end

      output.concat(merge_splits(good_splits, separator)) unless good_splits.empty?

      output
    end

    private

    # Greedily joins +splits+ with +separator+ into chunks.
    #
    # A chunk is emitted once adding the next piece would bring the running
    # total to +chunk_size+ or more. Leading pieces are then dropped until
    # the total is at most +chunk_overlap+ and the next piece fits, so the
    # remaining tail is repeated at the start of the next chunk.
    #
    # @param splits [Array<String>] pieces produced by splitting on +separator+
    # @param separator [String] string used to re-join the pieces
    # @return [Array<String>] stripped, non-empty chunks
    def merge_splits(splits, separator)
      output = []
      current_doc = []
      total = 0

      splits.each do |piece|
        if total + piece.length >= @chunk_size && !current_doc.empty?
          append_doc(output, current_doc, separator)

          # The emptiness guard only matters for a negative chunk_overlap,
          # where +total > @chunk_overlap+ would otherwise stay true after
          # the last piece has been removed.
          while !current_doc.empty? &&
                (total > @chunk_overlap || (total > 0 && (total + piece.length > @chunk_size)))
            total -= current_doc.first.length
            current_doc.shift
          end
        end
        current_doc << piece
        total += piece.length
      end
      append_doc(output, current_doc, separator)

      output
    end

    # Joins +pieces+ with +separator+, strips the result and appends it to
    # +output+ unless it is empty.
    def append_doc(output, pieces, separator)
      doc = pieces.join(separator).strip
      output << doc unless doc.empty?
    end
  end
end
@@ -0,0 +1,3 @@
1
module TextSplitters
  # Version string of this gem release; frozen so it cannot be mutated.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,5 @@
1
# Top-level namespace of the gem. Constants are registered for autoloading,
# so each file is only required the first time its constant is referenced.
module TextSplitters
  {
    VERSION: "text_splitters/version",
    RecursiveCharacterTextSplitter: "text_splitters/recursive_character_text_splitter"
  }.each do |const_name, path|
    autoload const_name, path
  end
end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_splitters
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Alex Ghiculescu
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-03-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email:
15
+ - alex@tanda.co
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - LICENSE.txt
21
+ - README.md
22
+ - lib/text_splitters.rb
23
+ - lib/text_splitters/recursive_character_text_splitter.rb
24
+ - lib/text_splitters/version.rb
25
+ homepage: https://github.com/ghiculescu/text_splitters
26
+ licenses:
27
+ - MIT
28
+ metadata:
29
+ bug_tracker_uri: https://github.com/ghiculescu/text_splitters/issues
30
+ changelog_uri: https://github.com/ghiculescu/text_splitters/releases
31
+ source_code_uri: https://github.com/ghiculescu/text_splitters
32
+ homepage_uri: https://github.com/ghiculescu/text_splitters
33
+ rubygems_mfa_required: 'true'
34
+ post_install_message:
35
+ rdoc_options: []
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '2.6'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubygems_version: 3.3.26
50
+ signing_key:
51
+ specification_version: 4
52
+ summary: Port of langchain's text splitters
53
+ test_files: []