baran 0.1.12 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 27ec927ed0af9bafe1a15e2844abcda91966b53aab50d3c4e34ba03638200ba3
4
- data.tar.gz: 8746c0aae319f0bd925855e8025715906e88036a70839313fe60e62e945db3c7
3
+ metadata.gz: 76dee67df26935789dbd26ee97d603c2f1fa96a49eff4466584016cf03e75d48
4
+ data.tar.gz: ea8899d24cd2b5177c76623cbfb705b49414a6af01e21c77d1596eb7f8ac845e
5
5
  SHA512:
6
- metadata.gz: eefdb623105249441368ba3a88eb3c958a4046d371c8cef21ad7960cf1609d6cfec218d56c11faa929db2ee2b216e86cc2fd53e0bd5ad5863be536126b490258
7
- data.tar.gz: af48e71277ab4b28a81b92ffb733c28f55f66f17a6741862fc954a5511ac1c7250dc07ac9b9d726974ce8747aa4b7b168eeb594820503a3388c63dbc994decfb
6
+ metadata.gz: d1c2abb85d19c69087ec5247eed4769b8e063cef939d653d2d977b318cf838b6991508091105dca6714fb1493d9a73b2859f7a9193a320357683aab2f61ab1df
7
+ data.tar.gz: 6dd5c438d0d159fa52dec486013fe08cf9037bf9e250fd776af63be8be1f22eb8171f01b45f019c632960f790bdf90965c8c313d119636e0de0fca72b875726d
data/Gemfile CHANGED
@@ -5,6 +5,6 @@ source "https://rubygems.org"
5
5
  # Specify your gem's dependencies in baran.gemspec
6
6
  gemspec
7
7
 
8
- gem "minitest", "~> 5.21"
8
+ gem "minitest", "~> 5.22"
9
9
 
10
10
  gem "rake", "~> 13.1"
data/Gemfile.lock CHANGED
@@ -1,12 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.12)
4
+ baran (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- minitest (5.21.2)
9
+ minitest (5.22.3)
10
10
  rake (13.1.0)
11
11
 
12
12
  PLATFORMS
@@ -15,7 +15,7 @@ PLATFORMS
15
15
 
16
16
  DEPENDENCIES
17
17
  baran!
18
- minitest (~> 5.21)
18
+ minitest (~> 5.22)
19
19
  rake (~> 13.1)
20
20
 
21
21
  BUNDLED WITH
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![v](https://badgen.net/rubygems/v/baran)
4
4
  ![dt](https://badgen.net/rubygems/dt/baran)
5
- ![license](https://badgen.net/github/license/moekidev/baran)
5
+ ![license](https://badgen.net/github/license/kawakamimoeki/baran)
6
6
 
7
7
  Text Splitter for Large Language Model datasets.
8
8
 
@@ -88,7 +88,7 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
88
88
 
89
89
  ## Contributing
90
90
 
91
- Bug reports and pull requests are welcome on GitHub at https://github.com/moekidev/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/moekidev/baran/blob/main/CODE_OF_CONDUCT.md).
91
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kawakamimoeki/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
92
92
 
93
93
  ## License
94
94
 
@@ -96,4 +96,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
96
96
 
97
97
  ## Code of Conduct
98
98
 
99
- Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/moekidev/baran/blob/main/CODE_OF_CONDUCT.md).
99
+ Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
data/lib/baran/sentence_text_splitter.rb ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Baran
  # Text splitter whose split step cuts the input into sentences, treating
  # runs of ".", "!" or "?" as sentence terminators.
  #
  # Chunk sizing is handled entirely by TextSplitter; this class only supplies
  # #splitted. NOTE(review): TextSplitter is defined elsewhere -- presumably it
  # calls #splitted and merges the pieces into chunks; confirm against it.
  class SentenceTextSplitter < TextSplitter
    # One sentence: a run of non-terminator characters followed by either
    #   * one or more terminators and then whitespace or end-of-text, or
    #   * end-of-text directly (an unterminated trailing fragment).
    #
    # The previous pattern required whitespace after the terminator, so the
    # last sentence of any text that did not end in whitespace was dropped.
    # Everything the previous pattern matched is still matched identically;
    # only a final piece can be added.
    #
    # Known limitation (unchanged): text in front of a terminator that is not
    # followed by whitespace, e.g. the "3." in "3.14 is pi. ", is skipped.
    SENTENCE_PATTERN = /[^.!?]+(?:[.!?]+(?:\s+|\z)|\z)/.freeze

    # @param chunk_size [Integer] forwarded unchanged to TextSplitter#initialize
    # @param chunk_overlap [Integer] forwarded unchanged to TextSplitter#initialize
    def initialize(chunk_size: 1024, chunk_overlap: 64)
      super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
    end

    # Splits +text+ into sentences with surrounding whitespace stripped.
    #
    # @param text [String] the text to split
    # @return [Array<String>] sentences in order of appearance; empty when
    #   +text+ is empty or contains only whitespace and terminators
    def splitted(text)
      # A whitespace-only tail can match the end-of-text branch; drop the
      # empty string it strips down to so callers never see blank pieces.
      text.scan(SENTENCE_PATTERN).map(&:strip).reject(&:empty?)
    end
  end
end
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.12"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/baran.rb CHANGED
@@ -5,6 +5,7 @@ require_relative "baran/text_splitter"
5
5
  require_relative "baran/markdown_splitter"
6
6
  require_relative "baran/recursive_character_text_splitter"
7
7
  require_relative "baran/character_text_splitter"
8
+ require_relative "baran/sentence_text_splitter"
8
9
 
9
10
  module Baran
10
11
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.12
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-31 00:00:00.000000000 Z
11
+ date: 2024-09-23 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email:
@@ -30,6 +30,7 @@ files:
30
30
  - lib/baran/character_text_splitter.rb
31
31
  - lib/baran/markdown_splitter.rb
32
32
  - lib/baran/recursive_character_text_splitter.rb
33
+ - lib/baran/sentence_text_splitter.rb
33
34
  - lib/baran/text_splitter.rb
34
35
  - lib/baran/version.rb
35
36
  - sig/baran.rbs