baran 0.1.11 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 95a0c57558fc237d12ab005d24e444381725f8ddbb1fbbfe1ee730f9e14384ba
4
- data.tar.gz: a7eacd8e62b27478df98aaaf534a6b37348696d2baa78e67c5781891c1b74c03
3
+ metadata.gz: 76dee67df26935789dbd26ee97d603c2f1fa96a49eff4466584016cf03e75d48
4
+ data.tar.gz: ea8899d24cd2b5177c76623cbfb705b49414a6af01e21c77d1596eb7f8ac845e
5
5
  SHA512:
6
- metadata.gz: f09dd858f1dee1189ee543b440e8196871129fe33b6558762aedc85a1c9a26aebb1e439ff7c9b9e52bb0d92f6b1ffff37207105a146928c9350b26ed949019cd
7
- data.tar.gz: 59ec6d83b1b7ce85e005dee095e8baf087ffffef750851a661957b431ef346521d75661cd266cf81e889eae64966b4876b3b8031ce43987fead5c2678437aace
6
+ metadata.gz: d1c2abb85d19c69087ec5247eed4769b8e063cef939d653d2d977b318cf838b6991508091105dca6714fb1493d9a73b2859f7a9193a320357683aab2f61ab1df
7
+ data.tar.gz: 6dd5c438d0d159fa52dec486013fe08cf9037bf9e250fd776af63be8be1f22eb8171f01b45f019c632960f790bdf90965c8c313d119636e0de0fca72b875726d
data/Gemfile CHANGED
@@ -5,6 +5,6 @@ source "https://rubygems.org"
5
5
  # Specify your gem's dependencies in baran.gemspec
6
6
  gemspec
7
7
 
8
- gem "minitest", "~> 5.21"
8
+ gem "minitest", "~> 5.22"
9
9
 
10
- gem "rake", "~> 13.0"
10
+ gem "rake", "~> 13.1"
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.11)
4
+ baran (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- minitest (5.21.2)
10
- rake (13.0.6)
9
+ minitest (5.22.3)
10
+ rake (13.1.0)
11
11
 
12
12
  PLATFORMS
13
13
  arm64-darwin-22
@@ -15,8 +15,8 @@ PLATFORMS
15
15
 
16
16
  DEPENDENCIES
17
17
  baran!
18
- minitest (~> 5.21)
19
- rake (~> 13.0)
18
+ minitest (~> 5.22)
19
+ rake (~> 13.1)
20
20
 
21
21
  BUNDLED WITH
22
22
  2.4.7
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![v](https://badgen.net/rubygems/v/baran)
4
4
  ![dt](https://badgen.net/rubygems/dt/baran)
5
- ![license](https://badgen.net/github/license/moekidev/baran)
5
+ ![license](https://badgen.net/github/license/kawakamimoeki/baran)
6
6
 
7
7
  Text Splitter for Large Language Model datasets.
8
8
 
@@ -88,7 +88,7 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
88
88
 
89
89
  ## Contributing
90
90
 
91
- Bug reports and pull requests are welcome on GitHub at https://github.com/moekidev/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/moekidev/baran/blob/main/CODE_OF_CONDUCT.md).
91
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kawakamimoeki/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
92
92
 
93
93
  ## License
94
94
 
@@ -96,4 +96,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
96
96
 
97
97
  ## Code of Conduct
98
98
 
99
- Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/moekidev/baran/blob/main/CODE_OF_CONDUCT.md).
99
+ Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Baran
  # Text splitter whose split step cuts the input at sentence boundaries.
  #
  # A sentence ends at a run of ".", "!" or "?" that is followed by
  # whitespace or by the end of the text.
  class SentenceTextSplitter < TextSplitter
    # One sentence: starts at the first non-whitespace character and runs
    # lazily up to either
    #   * a run of terminators followed by whitespace or end of string, or
    #   * the end of the string (trailing text without a terminator).
    # The lookahead keeps a period inside a token such as "3.14" from ending
    # the sentence, and the /m flag lets a sentence span line breaks.
    SENTENCE_PATTERN = /\S.*?(?:[.!?]+(?=\s|\z)|\z)/m

    # @param chunk_size [Integer] forwarded unchanged to TextSplitter (default 1024)
    # @param chunk_overlap [Integer] forwarded unchanged to TextSplitter (default 64)
    # NOTE(review): how the parent interprets these values is not visible
    # here; confirm against TextSplitter.
    def initialize(chunk_size: 1024, chunk_overlap: 64)
      super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
    end

    # Splits +text+ into sentences with surrounding whitespace removed.
    #
    # The previous pattern required whitespace after the terminator, so the
    # last sentence of a text that did not end in whitespace was dropped, as
    # was any text containing no terminator at all. Both are now returned.
    #
    # @param text [String] the text to split
    # @return [Array<String>] sentences in their original order; empty when
    #   +text+ is empty or contains only whitespace
    def splitted(text)
      # strip is still needed: an unterminated final fragment may carry
      # trailing whitespace up to the end of the string.
      text.scan(SENTENCE_PATTERN).map(&:strip)
    end
  end
end
@@ -22,7 +22,7 @@ module Baran
22
22
  chunk = { text: chunk, cursor: cursor }
23
23
  chunk[:metadata] = metadata if metadata
24
24
  chunks << chunk
25
- cursor += chunk.length
25
+ cursor += chunk[:text].length
26
26
  end
27
27
 
28
28
  chunks
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.11"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/baran.rb CHANGED
@@ -5,6 +5,7 @@ require_relative "baran/text_splitter"
5
5
  require_relative "baran/markdown_splitter"
6
6
  require_relative "baran/recursive_character_text_splitter"
7
7
  require_relative "baran/character_text_splitter"
8
+ require_relative "baran/sentence_text_splitter"
8
9
 
9
10
  module Baran
10
11
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-03-09 00:00:00.000000000 Z
11
+ date: 2024-09-23 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email:
@@ -30,6 +30,7 @@ files:
30
30
  - lib/baran/character_text_splitter.rb
31
31
  - lib/baran/markdown_splitter.rb
32
32
  - lib/baran/recursive_character_text_splitter.rb
33
+ - lib/baran/sentence_text_splitter.rb
33
34
  - lib/baran/text_splitter.rb
34
35
  - lib/baran/version.rb
35
36
  - sig/baran.rbs