baran 0.1.9 → 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -2
- data/Gemfile +1 -1
- data/Gemfile.lock +3 -3
- data/README.md +6 -6
- data/lib/baran/markdown_splitter.rb +3 -3
- data/lib/baran/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 95a0c57558fc237d12ab005d24e444381725f8ddbb1fbbfe1ee730f9e14384ba
|
4
|
+
data.tar.gz: a7eacd8e62b27478df98aaaf534a6b37348696d2baa78e67c5781891c1b74c03
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f09dd858f1dee1189ee543b440e8196871129fe33b6558762aedc85a1c9a26aebb1e439ff7c9b9e52bb0d92f6b1ffff37207105a146928c9350b26ed949019cd
|
7
|
+
data.tar.gz: 59ec6d83b1b7ce85e005dee095e8baf087ffffef750851a661957b431ef346521d75661cd266cf81e889eae64966b4876b3b8031ce43987fead5c2678437aace
|
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
baran (0.1.9)
|
4
|
+
baran (0.1.11)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
9
|
+
minitest (5.21.2)
|
10
10
|
rake (13.0.6)
|
11
11
|
|
12
12
|
PLATFORMS
|
@@ -15,7 +15,7 @@ PLATFORMS
|
|
15
15
|
|
16
16
|
DEPENDENCIES
|
17
17
|
baran!
|
18
|
-
minitest (~> 5.
|
18
|
+
minitest (~> 5.21)
|
19
19
|
rake (~> 13.0)
|
20
20
|
|
21
21
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -35,8 +35,8 @@ splitter = Baran::CharacterTextSplitter.new(
|
|
35
35
|
chunk_overlap: 64,
|
36
36
|
separator: "\n\n"
|
37
37
|
)
|
38
|
-
splitter.chunks(text)
|
39
|
-
# => [{ cursor: 0, text: "..." }, ...]
|
38
|
+
splitter.chunks(text, metadata: { ... })
|
39
|
+
# => [{ cursor: 0, text: "...", metadata: { ... } }, ...]
|
40
40
|
```
|
41
41
|
|
42
42
|
### Recursive Character Text Splitter
|
@@ -47,8 +47,8 @@ Splitting by the specified characters recursively.
|
|
47
47
|
splitter = Baran::RecursiveCharacterTextSplitter.new(
|
48
48
|
separators: ["\n\n", "\n", " ", ""]
|
49
49
|
)
|
50
|
-
splitter.chunks(text)
|
51
|
-
# => [{ cursor: 0, text: "..." }, ...]
|
50
|
+
splitter.chunks(text, metadata: { ... })
|
51
|
+
# => [{ cursor: 0, text: "...", metadata: { ... } }, ...]
|
52
52
|
```
|
53
53
|
|
54
54
|
### Markdown Text Splitter
|
@@ -57,8 +57,8 @@ Splitting by the Markdown descriptions.
|
|
57
57
|
|
58
58
|
```ruby
|
59
59
|
splitter = Baran::MarkdownSplitter.new
|
60
|
-
splitter.chunks(markdown)
|
61
|
-
# => [{ cursor: 0, text: "..." }, ...]
|
60
|
+
splitter.chunks(markdown, metadata: { ... })
|
61
|
+
# => [{ cursor: 0, text: "...", metadata: { ... } }, ...]
|
62
62
|
```
|
63
63
|
|
64
64
|
Split with the following priority.
|
data/lib/baran/markdown_splitter.rb
CHANGED
@@ -3,7 +3,7 @@ require_relative './recursive_character_text_splitter'
|
|
3
3
|
module Baran
|
4
4
|
class MarkdownSplitter < RecursiveCharacterTextSplitter
|
5
5
|
def initialize(chunk_size: 1024, chunk_overlap: 64)
|
6
|
-
|
6
|
+
separators = [
|
7
7
|
"\n# ", # h1
|
8
8
|
"\n## ", # h2
|
9
9
|
"\n### ", # h3
|
@@ -19,7 +19,7 @@ module Baran
|
|
19
19
|
" ", # space
|
20
20
|
"" # empty
|
21
21
|
]
|
22
|
-
super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
|
22
|
+
super(chunk_size: chunk_size, chunk_overlap: chunk_overlap, separators: separators)
|
23
23
|
end
|
24
24
|
end
|
25
|
-
end
|
25
|
+
end
|
data/lib/baran/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baran
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.1.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Moeki Kawakami
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-03-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Text Splitter for Large Language Model Datasets.
|
14
14
|
email:
|