baran 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9c113913bc9751c8dccc46fef4cf58b552e2ef629e0cbcbecf4009f1fb0c9665
4
- data.tar.gz: b166d1eab9681981435367ec57d24ce0dd12f45086f7654cfee0ffc6b120e2ec
3
+ metadata.gz: 8342de6adf861cc80d2e3b2b409e607b7b6a4a9eded4ce6218710ba9fa66d164
4
+ data.tar.gz: 2b50b7070a46aadc2f2a6727e9418134d6123dda81c5f3b9f21787e500142b7e
5
5
  SHA512:
6
- metadata.gz: 177b7eb08f07236e680316806fbf0f4fd2ef576ec36cf4005660503b594b90b023c1c29db54860b26990e65ea9ffa3b4d6dc94c913007b09cf018dd01d606754
7
- data.tar.gz: e7ec13bcadda327bb1ed691fc9bf55e8954f160bd33167549088baf92280a9e2b2c846450b15656034a46f3697f0c2513fee0ddda24528f56920c48606fb3e23
6
+ metadata.gz: '05859643d5c56270306611402878d0290658c327d086cc0fc3faf3098ada35a9efaa0a2bd027a848ef84157ba82014438e9ceab30139a2d56e1fe1223d8a4e5d'
7
+ data.tar.gz: b778120886153d0ad29eceb118534ef65ab68de7c6c956d8752d0c78b2aec947831b5a73ba75a202e2b8980a68c5f6d68bda69199b6be6f21657f3b6250b6941
data/CHANGELOG.md CHANGED
@@ -15,4 +15,13 @@
15
15
 
16
16
  ## [0.1.3] - 2023-05-28
17
17
 
18
- - Fix README
18
+ - Fix README
19
+
20
+ ## [0.1.4] - 2023-05-28
21
+
22
+ - Fix test
23
+ - Fix README
24
+
25
+ ## [0.1.5] - 2023-06-02
26
+
27
+ - Refactor
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- baran (0.1.3)
4
+ baran (0.1.5)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -41,7 +41,7 @@ Splitting by the specified characters recursively.
41
41
 
42
42
  ```ruby
43
43
  splitter = Baran::RecursiveCharacterTextSplitter.new(
44
- separators: ["\nn", "\n", " ", ""]
44
+ separators: ["\n\n", "\n", " ", ""]
45
45
  )
46
46
  splitter.chunks(text)
47
47
  # => [{ cursor: 0, text: "..." }, ...]
@@ -2,13 +2,15 @@ require_relative './text_splitter'
2
2
 
3
3
  module Baran
4
4
  class CharacterTextSplitter < TextSplitter
5
+ attr_accessor :separator
6
+
5
7
  def initialize(chunk_size: 1024, chunk_overlap: 64, separator: nil)
6
8
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
7
9
  @separator = separator || "\n\n"
8
10
  end
9
11
 
10
12
  def splitted(text)
11
- splits = @separator.empty? ? text.chars : text.split(@separator)
13
+ splits = separator.empty? ? text.chars : text.split(separator)
12
14
  merged(splits, @separator)
13
15
  end
14
16
  end
@@ -6,46 +6,38 @@ module Baran
6
6
 
7
7
  def initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil)
8
8
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
9
- @separators = separators || ["\n\n", "\n", " ", ""]
9
+ @separators = separators || ["\n\n", "\n", " "]
10
10
  end
11
11
 
12
12
  def splitted(text)
13
- final_chunks = []
13
+ results = []
14
+ good_splits = []
15
+ separator = ''
14
16
 
15
- separator = @separators.last
16
- @separators.each do |s|
17
- if s.empty?
18
- separator = s
19
- break
20
- elsif text.include?(s)
17
+ separators.each do |s|
18
+ if text.include?(s)
21
19
  separator = s
22
20
  break
23
21
  end
24
22
  end
25
23
 
26
- splits = separator.empty? ? text.chars : text.split(separator)
27
-
28
- good_splits = []
29
- splits.each do |s|
30
- if s.length < @chunk_size
24
+ text.split(separator).each do |s|
25
+ if s.length < chunk_size
31
26
  good_splits << s
32
27
  else
33
- unless good_splits.empty?
34
- merged_text = merged(good_splits, separator)
35
- final_chunks.concat(merged_text)
28
+ if good_splits.length.positive?
29
+ results += merged(good_splits, separator)
36
30
  good_splits.clear
37
31
  end
38
- other_info = splitted(s)
39
- final_chunks.concat(other_info)
32
+ results += splitted(s)
40
33
  end
41
34
  end
42
35
 
43
- unless good_splits.empty?
44
- merged_text = merged(good_splits, separator)
45
- final_chunks.concat(merged_text)
36
+ if good_splits.length.positive?
37
+ results += merged(good_splits, separator)
46
38
  end
47
39
 
48
- final_chunks
40
+ results
49
41
  end
50
42
  end
51
43
  end
@@ -24,39 +24,33 @@ module Baran
24
24
  chunks
25
25
  end
26
26
 
27
- def join_docs(docs, separator)
28
- text = docs.join(separator).strip
27
+ def joined(items, separator)
28
+ text = items.join(separator).strip
29
29
  text.empty? ? nil : text
30
30
  end
31
31
 
32
32
  def merged(splits, separator)
33
- docs = [] # Array of strings
34
- current_doc = [] # Array of strings
33
+ results = [] # Array of strings
34
+ current_splits = [] # Array of strings
35
35
  total = 0
36
36
 
37
- splits.each do |d|
38
- len = d.length
37
+ splits.each do |split|
38
+ if total + split.length >= chunk_size && current_splits.length.positive?
39
+ results << joined(current_splits, separator)
39
40
 
40
- if total + len >= @chunk_size
41
- unless current_doc.empty?
42
- doc = join_docs(current_doc, separator)
43
- docs.push(doc) unless doc.nil?
44
-
45
- while total > @chunk_overlap || (total + len > @chunk_size && total.positive?)
46
- total -= current_doc.first.length
47
- current_doc.shift
48
- end
41
+ while total > chunk_overlap || (total + split.length > chunk_size && total.positive?)
42
+ total -= current_splits.first.length
43
+ current_splits.shift
49
44
  end
50
45
  end
51
46
 
52
- current_doc.push(d)
53
- total += len
47
+ current_splits << split
48
+ total += split.length
54
49
  end
55
50
 
56
- doc = join_docs(current_doc, separator)
57
- docs.push(doc) unless doc.nil?
51
+ results << joined(current_splits, separator)
58
52
 
59
- docs
53
+ results
60
54
  end
61
55
  end
62
56
  end
data/lib/baran/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Baran
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.5"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baran
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moeki Kawakami
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-28 00:00:00.000000000 Z
11
+ date: 2023-06-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Text Splitter for Large Language Model Datasets.
14
14
  email: