baran 0.1.11 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -2
- data/Gemfile.lock +5 -5
- data/README.md +3 -3
- data/lib/baran/sentence_text_splitter.rb +14 -0
- data/lib/baran/text_splitter.rb +1 -1
- data/lib/baran/version.rb +1 -1
- data/lib/baran.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 76dee67df26935789dbd26ee97d603c2f1fa96a49eff4466584016cf03e75d48
|
4
|
+
data.tar.gz: ea8899d24cd2b5177c76623cbfb705b49414a6af01e21c77d1596eb7f8ac845e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d1c2abb85d19c69087ec5247eed4769b8e063cef939d653d2d977b318cf838b6991508091105dca6714fb1493d9a73b2859f7a9193a320357683aab2f61ab1df
|
7
|
+
data.tar.gz: 6dd5c438d0d159fa52dec486013fe08cf9037bf9e250fd776af63be8be1f22eb8171f01b45f019c632960f790bdf90965c8c313d119636e0de0fca72b875726d
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
baran (0.1.11)
|
4
|
+
baran (0.2.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
10
|
-
rake (13.0
|
9
|
+
minitest (5.22.3)
|
10
|
+
rake (13.1.0)
|
11
11
|
|
12
12
|
PLATFORMS
|
13
13
|
arm64-darwin-22
|
@@ -15,8 +15,8 @@ PLATFORMS
|
|
15
15
|
|
16
16
|
DEPENDENCIES
|
17
17
|
baran!
|
18
|
-
minitest (~> 5.
|
19
|
-
rake (~> 13.
|
18
|
+
minitest (~> 5.22)
|
19
|
+
rake (~> 13.1)
|
20
20
|
|
21
21
|
BUNDLED WITH
|
22
22
|
2.4.7
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|

|
4
4
|

|
5
|
-

|
6
6
|
|
7
7
|
Text Splitter for Large Language Model datasets.
|
8
8
|
|
@@ -88,7 +88,7 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
88
88
|
|
89
89
|
## Contributing
|
90
90
|
|
91
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
91
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kawakamimoeki/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
|
92
92
|
|
93
93
|
## License
|
94
94
|
|
@@ -96,4 +96,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
|
|
96
96
|
|
97
97
|
## Code of Conduct
|
98
98
|
|
99
|
-
Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/
|
99
|
+
Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
|
data/lib/baran/sentence_text_splitter.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Baran
|
4
|
+
class SentenceTextSplitter < TextSplitter
|
5
|
+
def initialize(chunk_size: 1024, chunk_overlap: 64)
|
6
|
+
super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
|
7
|
+
end
|
8
|
+
|
9
|
+
def splitted(text)
|
10
|
+
# Use a regex to split text based on the specified sentence-ending characters followed by whitespace
|
11
|
+
text.scan(/[^.!?]+[.!?]+(?:\s+)/).map(&:strip)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/lib/baran/text_splitter.rb
CHANGED
data/lib/baran/version.rb
CHANGED
data/lib/baran.rb
CHANGED
@@ -5,6 +5,7 @@ require_relative "baran/text_splitter"
|
|
5
5
|
require_relative "baran/markdown_splitter"
|
6
6
|
require_relative "baran/recursive_character_text_splitter"
|
7
7
|
require_relative "baran/character_text_splitter"
|
8
|
+
require_relative "baran/sentence_text_splitter"
|
8
9
|
|
9
10
|
module Baran
|
10
11
|
class Error < StandardError; end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baran
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.11
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Moeki Kawakami
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Text Splitter for Large Language Model Datasets.
|
14
14
|
email:
|
@@ -30,6 +30,7 @@ files:
|
|
30
30
|
- lib/baran/character_text_splitter.rb
|
31
31
|
- lib/baran/markdown_splitter.rb
|
32
32
|
- lib/baran/recursive_character_text_splitter.rb
|
33
|
+
- lib/baran/sentence_text_splitter.rb
|
33
34
|
- lib/baran/text_splitter.rb
|
34
35
|
- lib/baran/version.rb
|
35
36
|
- sig/baran.rbs
|