baran 0.1.12 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +3 -3
- data/README.md +3 -3
- data/lib/baran/sentence_text_splitter.rb +14 -0
- data/lib/baran/version.rb +1 -1
- data/lib/baran.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 76dee67df26935789dbd26ee97d603c2f1fa96a49eff4466584016cf03e75d48
|
4
|
+
data.tar.gz: ea8899d24cd2b5177c76623cbfb705b49414a6af01e21c77d1596eb7f8ac845e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d1c2abb85d19c69087ec5247eed4769b8e063cef939d653d2d977b318cf838b6991508091105dca6714fb1493d9a73b2859f7a9193a320357683aab2f61ab1df
|
7
|
+
data.tar.gz: 6dd5c438d0d159fa52dec486013fe08cf9037bf9e250fd776af63be8be1f22eb8171f01b45f019c632960f790bdf90965c8c313d119636e0de0fca72b875726d
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
baran (0.1.12)
|
4
|
+
baran (0.2.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
9
|
+
minitest (5.22.3)
|
10
10
|
rake (13.1.0)
|
11
11
|
|
12
12
|
PLATFORMS
|
@@ -15,7 +15,7 @@ PLATFORMS
|
|
15
15
|
|
16
16
|
DEPENDENCIES
|
17
17
|
baran!
|
18
|
-
minitest (~> 5.
|
18
|
+
minitest (~> 5.22)
|
19
19
|
rake (~> 13.1)
|
20
20
|
|
21
21
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
![v](https://badgen.net/rubygems/v/baran)
|
4
4
|
![dt](https://badgen.net/rubygems/dt/baran)
|
5
|
-
![license](https://badgen.net/github/license/
|
5
|
+
![license](https://badgen.net/github/license/kawakamimoeki/baran)
|
6
6
|
|
7
7
|
Text Splitter for Large Language Model datasets.
|
8
8
|
|
@@ -88,7 +88,7 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
88
88
|
|
89
89
|
## Contributing
|
90
90
|
|
91
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
91
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kawakamimoeki/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
|
92
92
|
|
93
93
|
## License
|
94
94
|
|
@@ -96,4 +96,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
|
|
96
96
|
|
97
97
|
## Code of Conduct
|
98
98
|
|
99
|
-
Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/
|
99
|
+
Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
|
data/lib/baran/sentence_text_splitter.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Baran
|
4
|
+
class SentenceTextSplitter < TextSplitter
|
5
|
+
def initialize(chunk_size: 1024, chunk_overlap: 64)
|
6
|
+
super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
|
7
|
+
end
|
8
|
+
|
9
|
+
def splitted(text)
|
10
|
+
# Use a regex to split text based on the specified sentence-ending characters followed by whitespace
|
11
|
+
text.scan(/[^.!?]+[.!?]+(?:\s+)/).map(&:strip)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/lib/baran/version.rb
CHANGED
data/lib/baran.rb
CHANGED
@@ -5,6 +5,7 @@ require_relative "baran/text_splitter"
|
|
5
5
|
require_relative "baran/markdown_splitter"
|
6
6
|
require_relative "baran/recursive_character_text_splitter"
|
7
7
|
require_relative "baran/character_text_splitter"
|
8
|
+
require_relative "baran/sentence_text_splitter"
|
8
9
|
|
9
10
|
module Baran
|
10
11
|
class Error < StandardError; end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baran
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.12
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Moeki Kawakami
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Text Splitter for Large Language Model Datasets.
|
14
14
|
email:
|
@@ -30,6 +30,7 @@ files:
|
|
30
30
|
- lib/baran/character_text_splitter.rb
|
31
31
|
- lib/baran/markdown_splitter.rb
|
32
32
|
- lib/baran/recursive_character_text_splitter.rb
|
33
|
+
- lib/baran/sentence_text_splitter.rb
|
33
34
|
- lib/baran/text_splitter.rb
|
34
35
|
- lib/baran/version.rb
|
35
36
|
- sig/baran.rbs
|