baran 0.1.11 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 95a0c57558fc237d12ab005d24e444381725f8ddbb1fbbfe1ee730f9e14384ba
4
- data.tar.gz: a7eacd8e62b27478df98aaaf534a6b37348696d2baa78e67c5781891c1b74c03
3
+ metadata.gz: 76dee67df26935789dbd26ee97d603c2f1fa96a49eff4466584016cf03e75d48
4
+ data.tar.gz: ea8899d24cd2b5177c76623cbfb705b49414a6af01e21c77d1596eb7f8ac845e
5
5
  SHA512:
6
- metadata.gz: f09dd858f1dee1189ee543b440e8196871129fe33b6558762aedc85a1c9a26aebb1e439ff7c9b9e52bb0d92f6b1ffff37207105a146928c9350b26ed949019cd
7
- data.tar.gz: 59ec6d83b1b7ce85e005dee095e8baf087ffffef750851a661957b431ef346521d75661cd266cf81e889eae64966b4876b3b8031ce43987fead5c2678437aace
6
+ metadata.gz: d1c2abb85d19c69087ec5247eed4769b8e063cef939d653d2d977b318cf838b6991508091105dca6714fb1493d9a73b2859f7a9193a320357683aab2f61ab1df
7
+ data.tar.gz: 6dd5c438d0d159fa52dec486013fe08cf9037bf9e250fd776af63be8be1f22eb8171f01b45f019c632960f790bdf90965c8c313d119636e0de0fca72b875726d
data/Gemfile CHANGED
@@ -5,6 +5,6 @@ source "https://rubygems.org"
5
5
  # Specify your gem's dependencies in baran.gemspec
6
6
  gemspec
7
7
 
8
- gem "minitest", "~> 5.21"
8
+ gem "minitest", "~> 5.22"
9
9
 
10
- gem "rake", "~> 13.0"
10
+ gem "rake", "~> 13.1"
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.11)
4
+ baran (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- minitest (5.21.2)
10
- rake (13.0.6)
9
+ minitest (5.22.3)
10
+ rake (13.1.0)
11
11
 
12
12
  PLATFORMS
13
13
  arm64-darwin-22
@@ -15,8 +15,8 @@ PLATFORMS
15
15
 
16
16
  DEPENDENCIES
17
17
  baran!
18
- minitest (~> 5.21)
19
- rake (~> 13.0)
18
+ minitest (~> 5.22)
19
+ rake (~> 13.1)
20
20
 
21
21
  BUNDLED WITH
22
22
  2.4.7
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![v](https://badgen.net/rubygems/v/baran)
4
4
  ![dt](https://badgen.net/rubygems/dt/baran)
5
- ![license](https://badgen.net/github/license/moekidev/baran)
5
+ ![license](https://badgen.net/github/license/kawakamimoeki/baran)
6
6
 
7
7
  Text Splitter for Large Language Model datasets.
8
8
 
@@ -88,7 +88,7 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
88
88
 
89
89
  ## Contributing
90
90
 
91
- Bug reports and pull requests are welcome on GitHub at https://github.com/moekidev/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/moekidev/baran/blob/main/CODE_OF_CONDUCT.md).
91
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kawakamimoeki/baran. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
92
92
 
93
93
  ## License
94
94
 
@@ -96,4 +96,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
96
96
 
97
97
  ## Code of Conduct
98
98
 
99
- Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/moekidev/baran/blob/main/CODE_OF_CONDUCT.md).
99
+ Everyone interacting in the Baran project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/kawakamimoeki/baran/blob/main/CODE_OF_CONDUCT.md).
data/lib/baran/sentence_text_splitter.rb ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Baran
4
+ class SentenceTextSplitter < TextSplitter
5
+ def initialize(chunk_size: 1024, chunk_overlap: 64)
6
+ super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
7
+ end
8
+
9
+ def splitted(text)
10
+ # Use a regex to split text based on the specified sentence-ending characters followed by whitespace
11
+ text.scan(/[^.!?]+[.!?]+(?:\s+)/).map(&:strip)
12
+ end
13
+ end
14
+ end
data/lib/baran/text_splitter.rb CHANGED
@@ -22,7 +22,7 @@ module Baran
22
22
  chunk = { text: chunk, cursor: cursor }
23
23
  chunk[:metadata] = metadata if metadata
24
24
  chunks << chunk
25
- cursor += chunk.length
25
+ cursor += chunk[:text].length
26
26
  end
27
27
 
28
28
  chunks
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.11"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/baran.rb CHANGED
@@ -5,6 +5,7 @@ require_relative "baran/text_splitter"
5
5
  require_relative "baran/markdown_splitter"
6
6
  require_relative "baran/recursive_character_text_splitter"
7
7
  require_relative "baran/character_text_splitter"
8
+ require_relative "baran/sentence_text_splitter"
8
9
 
9
10
  module Baran
10
11
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-03-09 00:00:00.000000000 Z
11
+ date: 2024-09-23 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email:
@@ -30,6 +30,7 @@ files:
30
30
  - lib/baran/character_text_splitter.rb
31
31
  - lib/baran/markdown_splitter.rb
32
32
  - lib/baran/recursive_character_text_splitter.rb
33
+ - lib/baran/sentence_text_splitter.rb
33
34
  - lib/baran/text_splitter.rb
34
35
  - lib/baran/version.rb
35
36
  - sig/baran.rbs