baran 0.1.3 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9c113913bc9751c8dccc46fef4cf58b552e2ef629e0cbcbecf4009f1fb0c9665
4
- data.tar.gz: b166d1eab9681981435367ec57d24ce0dd12f45086f7654cfee0ffc6b120e2ec
3
+ metadata.gz: 8342de6adf861cc80d2e3b2b409e607b7b6a4a9eded4ce6218710ba9fa66d164
4
+ data.tar.gz: 2b50b7070a46aadc2f2a6727e9418134d6123dda81c5f3b9f21787e500142b7e
5
5
  SHA512:
6
- metadata.gz: 177b7eb08f07236e680316806fbf0f4fd2ef576ec36cf4005660503b594b90b023c1c29db54860b26990e65ea9ffa3b4d6dc94c913007b09cf018dd01d606754
7
- data.tar.gz: e7ec13bcadda327bb1ed691fc9bf55e8954f160bd33167549088baf92280a9e2b2c846450b15656034a46f3697f0c2513fee0ddda24528f56920c48606fb3e23
6
+ metadata.gz: '05859643d5c56270306611402878d0290658c327d086cc0fc3faf3098ada35a9efaa0a2bd027a848ef84157ba82014438e9ceab30139a2d56e1fe1223d8a4e5d'
7
+ data.tar.gz: b778120886153d0ad29eceb118534ef65ab68de7c6c956d8752d0c78b2aec947831b5a73ba75a202e2b8980a68c5f6d68bda69199b6be6f21657f3b6250b6941
data/CHANGELOG.md CHANGED
@@ -15,4 +15,13 @@
15
15
 
16
16
  ## [0.1.3] - 2023-05-28
17
17
 
18
- - Fix README
18
+ - Fix README
19
+
20
+ ## [0.1.4] - 2023-05-28
21
+
22
+ - Fix test
23
+ - Fix README
24
+
25
+ ## [0.1.5] - 2023-06-02
26
+
27
+ - Refactor
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.3)
4
+ baran (0.1.5)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -41,7 +41,7 @@ Splitting by the specified characters recursively.
41
41
 
42
42
  ```ruby
43
43
  splitter = Baran::RecursiveCharacterTextSplitter.new(
44
- separators: ["\nn", "\n", " ", ""]
44
+ separators: ["\n\n", "\n", " ", ""]
45
45
  )
46
46
  splitter.chunks(text)
47
47
  # => [{ cursor: 0, text: "..." }, ...]
@@ -2,13 +2,15 @@ require_relative './text_splitter'
2
2
 
3
3
  module Baran
4
4
  class CharacterTextSplitter < TextSplitter
5
+ attr_accessor :separator
6
+
5
7
  def initialize(chunk_size: 1024, chunk_overlap: 64, separator: nil)
6
8
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
7
9
  @separator = separator || "\n\n"
8
10
  end
9
11
 
10
12
  def splitted(text)
11
- splits = @separator.empty? ? text.chars : text.split(@separator)
13
+ splits = separator.empty? ? text.chars : text.split(separator)
12
14
  merged(splits, @separator)
13
15
  end
14
16
  end
@@ -6,46 +6,38 @@ module Baran
6
6
 
7
7
  def initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil)
8
8
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
9
- @separators = separators || ["\n\n", "\n", " ", ""]
9
+ @separators = separators || ["\n\n", "\n", " "]
10
10
  end
11
11
 
12
12
  def splitted(text)
13
- final_chunks = []
13
+ results = []
14
+ good_splits = []
15
+ separator = ''
14
16
 
15
- separator = @separators.last
16
- @separators.each do |s|
17
- if s.empty?
18
- separator = s
19
- break
20
- elsif text.include?(s)
17
+ separators.each do |s|
18
+ if text.include?(s)
21
19
  separator = s
22
20
  break
23
21
  end
24
22
  end
25
23
 
26
- splits = separator.empty? ? text.chars : text.split(separator)
27
-
28
- good_splits = []
29
- splits.each do |s|
30
- if s.length < @chunk_size
24
+ text.split(separator).each do |s|
25
+ if s.length < chunk_size
31
26
  good_splits << s
32
27
  else
33
- unless good_splits.empty?
34
- merged_text = merged(good_splits, separator)
35
- final_chunks.concat(merged_text)
28
+ if good_splits.length.positive?
29
+ results += merged(good_splits, separator)
36
30
  good_splits.clear
37
31
  end
38
- other_info = splitted(s)
39
- final_chunks.concat(other_info)
32
+ results += splitted(s)
40
33
  end
41
34
  end
42
35
 
43
- unless good_splits.empty?
44
- merged_text = merged(good_splits, separator)
45
- final_chunks.concat(merged_text)
36
+ if good_splits.length.positive?
37
+ results += merged(good_splits, separator)
46
38
  end
47
39
 
48
- final_chunks
40
+ results
49
41
  end
50
42
  end
51
43
  end
@@ -24,39 +24,33 @@ module Baran
24
24
  chunks
25
25
  end
26
26
 
27
- def join_docs(docs, separator)
28
- text = docs.join(separator).strip
27
+ def joined(items, separator)
28
+ text = items.join(separator).strip
29
29
  text.empty? ? nil : text
30
30
  end
31
31
 
32
32
  def merged(splits, separator)
33
- docs = [] # Array of strings
34
- current_doc = [] # Array of strings
33
+ results = [] # Array of strings
34
+ current_splits = [] # Array of strings
35
35
  total = 0
36
36
 
37
- splits.each do |d|
38
- len = d.length
37
+ splits.each do |split|
38
+ if total + split.length >= chunk_size && current_splits.length.positive?
39
+ results << joined(current_splits, separator)
39
40
 
40
- if total + len >= @chunk_size
41
- unless current_doc.empty?
42
- doc = join_docs(current_doc, separator)
43
- docs.push(doc) unless doc.nil?
44
-
45
- while total > @chunk_overlap || (total + len > @chunk_size && total.positive?)
46
- total -= current_doc.first.length
47
- current_doc.shift
48
- end
41
+ while total > chunk_overlap || (total + split.length > chunk_size && total.positive?)
42
+ total -= current_splits.first.length
43
+ current_splits.shift
49
44
  end
50
45
  end
51
46
 
52
- current_doc.push(d)
53
- total += len
47
+ current_splits << split
48
+ total += split.length
54
49
  end
55
50
 
56
- doc = join_docs(current_doc, separator)
57
- docs.push(doc) unless doc.nil?
51
+ results << joined(current_splits, separator)
58
52
 
59
- docs
53
+ results
60
54
  end
61
55
  end
62
56
  end
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.5"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-28 00:00:00.000000000 Z
11
+ date: 2023-06-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email: