baran 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d64ad8b3b3b0f9d1bd32ce37453a6bf70581f5e229883b2d78b5200c236f01cc
4
- data.tar.gz: 703df213c544557522ac36ac16404d04f369fd24c03446928aed7c2cadca1be4
3
+ metadata.gz: 3c45d696e127a2bdc00606665d6585c74f0a82657bcce81d3e581fb9d5c3e692
4
+ data.tar.gz: 514c86c899e2a804d26b3bc3f6a24d6d35a8a7108a0450562e3050b3c798a3f0
5
5
  SHA512:
6
- metadata.gz: 471f2d07989fce4ec393eaff5dc995ae721902f607632f00816d7d8652a5740be13c75d5bae8e2d30028243de05c1eabac259806ff97758c6987efaf619873dc
7
- data.tar.gz: f74445571e79a792c9d485f06b943b4a84f8366e80f02036ef7be14b628a1e9124b385a39a543d5964490f52a00b234f29f349e2327744f07b20a19a5242d746
6
+ metadata.gz: 359d335677a6a7c08f31c4f2602915374a4ed09772d045b2d098713e1ae550e81660f762aa6aec8bb3457ef2498a0832e97b26f19b82b7c5a1548fb45b859250
7
+ data.tar.gz: 3a010e8aa5547283641f8069a93e98cb2032a191411ec355398e00b4c83ce0953295c8ffea53e54c9240e2ae6edaa83d330f2d2f84dd3066a48438b9b70b223e
data/CHANGELOG.md CHANGED
@@ -20,4 +20,8 @@
20
20
  ## [0.1.4] - 2023-05-28
21
21
 
22
22
  - Fix test
23
- - Fix README
23
+ - Fix README
24
+
25
+ ## [0.1.5] - 2023-06-02
26
+
27
+ - Refactor
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.4)
4
+ baran (0.1.6)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -2,13 +2,15 @@ require_relative './text_splitter'
2
2
 
3
3
  module Baran
4
4
  class CharacterTextSplitter < TextSplitter
5
+ attr_accessor :separator
6
+
5
7
  def initialize(chunk_size: 1024, chunk_overlap: 64, separator: nil)
6
8
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
7
9
  @separator = separator || "\n\n"
8
10
  end
9
11
 
10
12
  def splitted(text)
11
- splits = @separator.empty? ? text.chars : text.split(@separator)
13
+ splits = separator.empty? ? text.chars : text.split(separator)
12
14
  merged(splits, @separator)
13
15
  end
14
16
  end
@@ -6,46 +6,38 @@ module Baran
6
6
 
7
7
  def initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil)
8
8
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
9
- @separators = separators || ["\n\n", "\n", " ", ""]
9
+ @separators = separators || ["\n\n", "\n", " "]
10
10
  end
11
11
 
12
12
  def splitted(text)
13
- final_chunks = []
13
+ results = []
14
+ good_splits = []
15
+ separator = ''
14
16
 
15
- separator = @separators.last
16
- @separators.each do |s|
17
- if s.empty?
18
- separator = s
19
- break
20
- elsif text.include?(s)
17
+ separators.each do |s|
18
+ if text.include?(s)
21
19
  separator = s
22
20
  break
23
21
  end
24
22
  end
25
23
 
26
- splits = separator.empty? ? text.chars : text.split(separator)
27
-
28
- good_splits = []
29
- splits.each do |s|
30
- if s.length < @chunk_size
24
+ text.split(separator).each do |s|
25
+ if s.length < chunk_size
31
26
  good_splits << s
32
27
  else
33
- unless good_splits.empty?
34
- merged_text = merged(good_splits, separator)
35
- final_chunks.concat(merged_text)
28
+ if good_splits.length.positive?
29
+ results += merged(good_splits, separator)
36
30
  good_splits.clear
37
31
  end
38
- other_info = splitted(s)
39
- final_chunks.concat(other_info)
32
+ results += splitted(s)
40
33
  end
41
34
  end
42
35
 
43
- unless good_splits.empty?
44
- merged_text = merged(good_splits, separator)
45
- final_chunks.concat(merged_text)
36
+ if good_splits.length.positive?
37
+ results += merged(good_splits, separator)
46
38
  end
47
39
 
48
- final_chunks
40
+ results
49
41
  end
50
42
  end
51
43
  end
@@ -24,39 +24,33 @@ module Baran
24
24
  chunks
25
25
  end
26
26
 
27
- def join_docs(docs, separator)
28
- text = docs.join(separator).strip
27
+ def joined(items, separator)
28
+ text = items.join(separator).strip
29
29
  text.empty? ? nil : text
30
30
  end
31
31
 
32
32
  def merged(splits, separator)
33
- docs = [] # Array of strings
34
- current_doc = [] # Array of strings
33
+ results = [] # Array of strings
34
+ current_splits = [] # Array of strings
35
35
  total = 0
36
36
 
37
- splits.each do |d|
38
- len = d.length
37
+ splits.each do |split|
38
+ if total + split.length >= chunk_size && current_splits.length.positive?
39
+ results << joined(current_splits, separator)
39
40
 
40
- if total + len >= @chunk_size
41
- unless current_doc.empty?
42
- doc = join_docs(current_doc, separator)
43
- docs.push(doc) unless doc.nil?
44
-
45
- while total > @chunk_overlap || (total + len > @chunk_size && total.positive?)
46
- total -= current_doc.first.length
47
- current_doc.shift
48
- end
41
+ while total > chunk_overlap || (total + split.length >= chunk_size && total.positive?)
42
+ total -= current_splits.first.length
43
+ current_splits.shift
49
44
  end
50
45
  end
51
46
 
52
- current_doc.push(d)
53
- total += len
47
+ current_splits << split
48
+ total += split.length
54
49
  end
55
50
 
56
- doc = join_docs(current_doc, separator)
57
- docs.push(doc) unless doc.nil?
51
+ results << joined(current_splits, separator)
58
52
 
59
- docs
53
+ results
60
54
  end
61
55
  end
62
56
  end
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.4"
4
+ VERSION = "0.1.6"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-28 00:00:00.000000000 Z
11
+ date: 2023-06-15 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email: