baran 0.1.9 → 0.1.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +6 -6
- data/lib/baran/markdown_splitter.rb +3 -3
- data/lib/baran/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6cd3f94a89b19931ba237a67a23119a23549a70a200d6074e072d4ccd84f459
|
4
|
+
data.tar.gz: e395826b7cde638a330dfbf8f66a0864053bf96cebd0c49127706d0b1556bee6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 421472e67d0279cac41ce4a36db642dadc3ca5b1b3f1f6cde6cb86e7d08038eec007e32b622e099fd6169a23cca2b25fd994598cfabd154901edfee74fe22ce4
|
7
|
+
data.tar.gz: 54e61bb7255e7269a06aa8df912d8a839c3c124dc4f2f73bba502070d07eafa3d04728848842b80005e4a0cfe3fcd0fb6244ee413ac1c6e283e87c2e2f21f451
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -35,8 +35,8 @@ splitter = Baran::CharacterTextSplitter.new(
|
|
35
35
|
chunk_overlap: 64,
|
36
36
|
separator: "\n\n"
|
37
37
|
)
|
38
|
-
splitter.chunks(text)
|
39
|
-
# => [{ cursor: 0, text: "..." }, ...]
|
38
|
+
splitter.chunks(text, metadata: { ... })
|
39
|
+
# => [{ cursor: 0, text: "...", metadata: { ... } }, ...]
|
40
40
|
```
|
41
41
|
|
42
42
|
### Recursive Character Text Splitter
|
@@ -47,8 +47,8 @@ Splitting by the specified characters recursively.
|
|
47
47
|
splitter = Baran::RecursiveCharacterTextSplitter.new(
|
48
48
|
separators: ["\n\n", "\n", " ", ""]
|
49
49
|
)
|
50
|
-
splitter.chunks(text)
|
51
|
-
# => [{ cursor: 0, text: "..." }, ...]
|
50
|
+
splitter.chunks(text, metadata: { ... })
|
51
|
+
# => [{ cursor: 0, text: "...", metadata: { ... } }, ...]
|
52
52
|
```
|
53
53
|
|
54
54
|
### Markdown Text Splitter
|
@@ -57,8 +57,8 @@ Splitting by the Markdown descriptions.
|
|
57
57
|
|
58
58
|
```ruby
|
59
59
|
splitter = Baran::MarkdownSplitter.new
|
60
|
-
splitter.chunks(markdown)
|
61
|
-
# => [{ cursor: 0, text: "..." }, ...]
|
60
|
+
splitter.chunks(markdown, metadata: { ... })
|
61
|
+
# => [{ cursor: 0, text: "...", metadata: { ... } }, ...]
|
62
62
|
```
|
63
63
|
|
64
64
|
Split with the following priority.
|
data/lib/baran/markdown_splitter.rb
CHANGED
@@ -3,7 +3,7 @@ require_relative './recursive_character_text_splitter'
|
|
3
3
|
module Baran
|
4
4
|
class MarkdownSplitter < RecursiveCharacterTextSplitter
|
5
5
|
def initialize(chunk_size: 1024, chunk_overlap: 64)
|
6
|
-
|
6
|
+
separators = [
|
7
7
|
"\n# ", # h1
|
8
8
|
"\n## ", # h2
|
9
9
|
"\n### ", # h3
|
@@ -19,7 +19,7 @@ module Baran
|
|
19
19
|
" ", # space
|
20
20
|
"" # empty
|
21
21
|
]
|
22
|
-
super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
|
22
|
+
super(chunk_size: chunk_size, chunk_overlap: chunk_overlap, separators: separators)
|
23
23
|
end
|
24
24
|
end
|
25
|
-
end
|
25
|
+
end
|
data/lib/baran/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baran
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.1.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Moeki Kawakami
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-11-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Text Splitter for Large Language Model Datasets.
|
14
14
|
email:
|