baran 0.1.6 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3c45d696e127a2bdc00606665d6585c74f0a82657bcce81d3e581fb9d5c3e692
4
- data.tar.gz: 514c86c899e2a804d26b3bc3f6a24d6d35a8a7108a0450562e3050b3c798a3f0
3
+ metadata.gz: cac2a83bc8fd8a978a3fdc706747ee8f023f3d143b7b6c67e22408ade389e4ef
4
+ data.tar.gz: b0f0fccc1e7e9a7872400d929fd4a1cb8db2ec6b18d092177d6973506a0caa61
5
5
  SHA512:
6
- metadata.gz: 359d335677a6a7c08f31c4f2602915374a4ed09772d045b2d098713e1ae550e81660f762aa6aec8bb3457ef2498a0832e97b26f19b82b7c5a1548fb45b859250
7
- data.tar.gz: 3a010e8aa5547283641f8069a93e98cb2032a191411ec355398e00b4c83ce0953295c8ffea53e54c9240e2ae6edaa83d330f2d2f84dd3066a48438b9b70b223e
6
+ metadata.gz: 22a196b7a71c83c362e207ae242d1c0863d2d26d9be82aa0e7c09baf193b69478f2ad78b7f6d952d724435afee0435ab922851685e6f78afc9ac1e2746ac8389
7
+ data.tar.gz: c2753c59488a1f9bc65bf6fb8b08395f1cb476053c0a108a7fb53f621ec3b95eeabe9527fd43b1f9420f4fe774057592be5cfb57b88248800b832fdfb0d85368
data/Gemfile CHANGED
@@ -5,6 +5,6 @@ source "https://rubygems.org"
5
5
  # Specify your gem's dependencies in baran.gemspec
6
6
  gemspec
7
7
 
8
- gem "minitest", "~> 5.18"
8
+ gem "minitest", "~> 5.19"
9
9
 
10
10
  gem "rake", "~> 13.0"
data/Gemfile.lock CHANGED
@@ -1,12 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.6)
4
+ baran (0.1.8)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- minitest (5.18.0)
9
+ minitest (5.19.0)
10
10
  rake (13.0.6)
11
11
 
12
12
  PLATFORMS
@@ -15,7 +15,7 @@ PLATFORMS
15
15
 
16
16
  DEPENDENCIES
17
17
  baran!
18
- minitest (~> 5.18)
18
+ minitest (~> 5.19)
19
19
  rake (~> 13.0)
20
20
 
21
21
  BUNDLED WITH
data/README.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # Baran
2
2
 
3
+ ![v](https://badgen.net/rubygems/v/baran)
4
+ ![dt](https://badgen.net/rubygems/dt/baran)
5
+ ![license](https://badgen.net/github/license/moekidev/baran)
6
+
3
7
  Text Splitter for Large Language Model datasets.
4
8
 
5
9
  To avoid token constraints and improve the accuracy of vector search in the Large Language Model, it is necessary to divide the document. This gem supports splitting the text in the specified manner.
data/lib/baran/text_splitter.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'logger'
2
+
1
3
  module Baran
2
4
  class TextSplitter
3
5
  attr_accessor :chunk_size, :chunk_overlap
@@ -16,7 +18,7 @@ module Baran
16
18
  cursor = 0
17
19
  chunks = []
18
20
 
19
- splitted(text).each do |chunk|
21
+ splitted(text).compact.each do |chunk|
20
22
  chunks << { text: chunk, cursor: cursor }
21
23
  cursor += chunk.length
22
24
  end
@@ -46,6 +48,7 @@ module Baran
46
48
 
47
49
  current_splits << split
48
50
  total += split.length
51
+ Logger.new(STDOUT).warn("Created a chunk of size #{total}, which is longer than the specified #{@chunk_size}") if total > @chunk_size
49
52
  end
50
53
 
51
54
  results << joined(current_splits, separator)
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.6"
4
+ VERSION = "0.1.8"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-06-15 00:00:00.000000000 Z
11
+ date: 2023-08-24 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email: