kaba 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +9 -0
- data/Rakefile +4 -0
- data/exe/kaba +30 -0
- data/kaba.gemspec +42 -0
- data/lib/kaba/_DPodfile_ +51 -0
- data/lib/kaba/dataset.rb +68 -0
- data/lib/kaba/dataset_source.rb +41 -0
- data/lib/kaba/prompt.rb +26 -0
- data/lib/kaba/validate.rb +84 -0
- data/lib/kaba/version.rb +5 -0
- data/lib/kaba.rb +12 -0
- data/sig/kaba.rbs +4 -0
- metadata +128 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7d0d15c1454667339192ecb68a40a0128221f0e2f176c9668fdddbaf6507a143
|
4
|
+
data.tar.gz: 63d4b6088c24453874a2acf9a1c183b6d7468a0297c43cf32feccd0e838f84c0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0e937f2aea17053fd01b8a4b5f76f61d58d2817eed4e12942f4edaee0ef72b8cf2fc2c3febeb082d49131fad6aba495bfc7385d59bd13c127b8faaa0ad1a7719
|
7
|
+
data.tar.gz: 592a977e7ec2c0b80f76e9f4c9fa0875357466ab1e6cbb05e9dec0fdabe086d8f25fda39caad60d0334ed145a69483ef6ea091d25bc5c39299c9afc5380d6155
|
data/README.md
ADDED
data/Rakefile
ADDED
data/exe/kaba
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "bundler/setup"
|
3
|
+
|
4
|
+
require 'async'
|
5
|
+
require 'faraday'
|
6
|
+
require 'colorize'
|
7
|
+
require 'tty-progressbar'
|
8
|
+
require 'async/http/faraday'
|
9
|
+
|
10
|
+
require 'json'
|
11
|
+
require "kaba"
|
12
|
+
|
13
|
+
# Holder for the single HTTP client used by the executable.
#
# Prompt#render and Validate#run both go through Application.connection, so
# every request made during one run shares the same Faraday connection.
class Application
  # Returns the Faraday connection to https://lisa-typechat.listenai.com,
  # building it on first use and reusing it afterwards.
  #
  # The connection uses the :async_http adapter with persistent clients and
  # the :json request middleware (request bodies are encoded as JSON).
  #
  # @return [Faraday::Connection]
  def self.connection
    return @connection if @connection

    @connection = Faraday.new('https://lisa-typechat.listenai.com') do |builder|
      builder.adapter :async_http, clients: Async::HTTP::Faraday::PersistentClients
      builder.request :json
    end
  end
end
|
23
|
+
|
24
|
+
# Breakpoint helper for DPodfile scripts: loads IRB on demand and opens an
# interactive session at the point of the call.
#
# The session is bound to this method's own scope, so it shares the caller's
# `self` but does not see the caller's local variables.
def ddbug
  require 'irb' # loaded lazily so normal runs never pay for IRB
  binding.irb
end
|
28
|
+
|
29
|
+
# 运行 DPodfile 文件,DPodfile 是一个 Ruby 文件
|
30
|
+
load DatasetSource.podfile
|
data/kaba.gemspec
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/kaba/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = "kaba"
|
7
|
+
spec.version = Kaba::VERSION
|
8
|
+
spec.authors = ["MJ"]
|
9
|
+
spec.email = ["tywf91@gmail.com"]
|
10
|
+
|
11
|
+
spec.summary = "用来做数据集的工具"
|
12
|
+
spec.description = "用来做数据集的工具"
|
13
|
+
spec.homepage = "https://github.com/mjason/kaba.git"
|
14
|
+
spec.required_ruby_version = ">= 3.3.0"
|
15
|
+
|
16
|
+
# spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
|
17
|
+
|
18
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
19
|
+
spec.metadata["source_code_uri"] = "https://github.com/mjason/kaba.git"
|
20
|
+
spec.metadata["changelog_uri"] = "https://github.com/mjason/kaba.git"
|
21
|
+
|
22
|
+
# Specify which files should be added to the gem when it is released.
|
23
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
24
|
+
spec.files = Dir.chdir(__dir__) do
|
25
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
26
|
+
(File.expand_path(f) == __FILE__) ||
|
27
|
+
f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
|
28
|
+
end
|
29
|
+
end
|
30
|
+
spec.bindir = "exe"
|
31
|
+
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
32
|
+
spec.require_paths = ["lib"]
|
33
|
+
|
34
|
+
spec.add_dependency "async", "~> 2.20"
|
35
|
+
spec.add_dependency "faraday", "~> 2.12"
|
36
|
+
spec.add_dependency "async-http-faraday", "~> 0.19.0"
|
37
|
+
spec.add_dependency "colorize", "~> 1.1"
|
38
|
+
spec.add_dependency "tty-progressbar", "~> 0.18.3"
|
39
|
+
|
40
|
+
# For more information and examples about making a new gem, check out our
|
41
|
+
# guide at: https://bundler.io/guides/creating_gem.html
|
42
|
+
end
|
data/lib/kaba/_DPodfile_
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
## 使用 Ruby 语言编写的数据集校验脚本
|
2
|
+
# ddbug 断点调试
|
3
|
+
# 使用 colorize 来输出带颜色的信息,https://github.com/fazibear/colorize
|
4
|
+
# 使用 progressbar 来显示进度条,https://github.com/piotrmurach/tty-progressbar
|
5
|
+
# 设置数据集目录, 如果使用 Docker 方式运行,需要将数据集挂载到 /data 目录下,DatasetSource 会自动加载 /data 目录下的数据集
|
6
|
+
source = DatasetSource.new(File.join(__dir__, 'data'))
|
7
|
+
schema = source.schema.join('resume.ts').read
|
8
|
+
type_name = 'Resume'
|
9
|
+
prompt = Prompt.new(schema, type_name)
|
10
|
+
validate = Validate.new(schema: schema, type_name: type_name)
|
11
|
+
dataset = Dataset.new(source.row, prompt)
|
12
|
+
|
13
|
+
# #
|
14
|
+
# # 可以使用 run_file 来验证单个文件
|
15
|
+
# response = validate.run_file(source.row.join('1.target.json'))
|
16
|
+
# puts response
|
17
|
+
# #
|
18
|
+
|
19
|
+
## 一般来说直接 run_files 就可以了,支持 limit 来限制读取的文件数量
|
20
|
+
validate.run_files(source.row)
|
21
|
+
|
22
|
+
## 数据集处理部分
|
23
|
+
# 支持 limit 来限制读取的文件数量
|
24
|
+
dataset.scan()
|
25
|
+
dataset.save(source.join('train.jsonl'))
|
26
|
+
|
27
|
+
## 高级玩法,不要轻易尝试
|
28
|
+
#
|
29
|
+
### 可以加入 limit 来限制读取的文件数量,validate.run_files('./data/row', limit: 1) do |response, progressbar|
|
30
|
+
# validate.run_files(source.row) do |response, progressbar|
|
31
|
+
# progressbar.log response.to_s unless response.success?
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
#
|
35
|
+
## 也支持 limit 来进行限制
|
36
|
+
# Async do
|
37
|
+
# dataset.each do |row, ds|
|
38
|
+
# Async do
|
39
|
+
# instruction = prompt.render(File.read row.input_file)
|
40
|
+
# target = <<~Markdown
|
41
|
+
# ```json
|
42
|
+
# #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
|
43
|
+
# ```
|
44
|
+
# Markdown
|
45
|
+
# ds.add({ instruction: instruction, output: target })
|
46
|
+
|
47
|
+
# instruction = prompt.render(File.read(row.input_file), export: true)
|
48
|
+
# ds.add({ instruction: instruction, output: target })
|
49
|
+
# end
|
50
|
+
# end
|
51
|
+
# end.wait
|
data/lib/kaba/dataset.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# Builds a training dataset from the `*.target.json` files of one directory.
#
# Each data file is wrapped in a Row; #scan renders the row's input text
# through the prompt object and stores the result, together with the
# pretty-printed target JSON, as `{ instruction:, output: }` records in #lines.
class Dataset
  # @return [Array] records collected so far through #add
  attr_reader :lines

  # @param data_dir [String, #to_path] directory searched for `*.target.json` files
  # @param prompt [#render] object responding to `render(input, export: false)`
  def initialize(data_dir, prompt)
    @data_files = Dir.glob(File.join(File.expand_path(data_dir), '*.target.json'))
    @lines = []
    @prompt = prompt
  end

  # Yields `(row, dataset)` for every data file so the block can inspect the
  # row and push records with #add.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil visits all
  def _each(limit: nil)
    files_for(limit).each do |file|
      yield(Row.new(file), self)
    end
  end

  # Same as #_each, but prints a red warning first.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil visits all
  def each(limit: nil, &block)
    puts "Warning: each is very dangerous".colorize(:red)
    _each(limit: limit, &block)
  end

  # Appends one record to the dataset.
  #
  # @param data [Object] record to store; #save serialises it with `to_json`
  def add(data)
    @lines << data
  end

  # Writes every collected record to +file_path+ as one JSON document per
  # line, replacing any existing file.
  #
  # @param file_path [String, #to_path] destination file
  def save(file_path)
    File.open(File.expand_path(file_path), 'w') do |file|
      @lines.each { |line| file.puts(line.to_json) }
    end
  end

  # @return [Boolean] true when exactly two records exist per data file,
  #   which is what a full, unlimited #scan produces
  def validate
    @lines.size == (@data_files.size * 2)
  end

  # Renders two records per data file (one with the schema as given, one with
  # `export: true`) and adds them to the dataset, showing a progress bar.
  #
  # @param limit [Integer, nil] maximum number of files to process; nil processes all
  def scan(limit: nil)
    # Size the bar from the files that will actually be visited; using the
    # full file count would leave the bar unfinished whenever limit is set.
    progressbar = TTY::ProgressBar.new(
      "Dataset: [:bar] :percent :current/:total",
      total: files_for(limit).size
    )

    Async do
      _each(limit: limit) do |row, ds|
        Async do
          input = File.read(row.input_file) # read once, used for both renders
          instruction = @prompt.render(input)
          target = "```json\n#{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}\n```\n"
          ds.add({ instruction: instruction, output: target })

          instruction = @prompt.render(input, export: true)
          ds.add({ instruction: instruction, output: target })

          progressbar.advance
        end
      end
    end.wait
  end

  private

  # Data files selected by +limit+ (all of them when limit is nil).
  def files_for(limit)
    @data_files.first(limit || @data_files.size)
  end
end
|
61
|
+
|
62
|
+
# One dataset entry: a `*.target.json` file plus the path of the
# `*.input.txt` file that sits next to it.
class Row
  # @return [String] absolute path of the target JSON file
  # @return [String] target path with a trailing `.target.json` swapped for `.input.txt`
  attr_reader :target_path, :input_file

  # @param file [String] path of a `*.target.json` file; expanded to an absolute path
  def initialize(file)
    @target_path = File.expand_path(file)
    # \z anchors at the true end of the string; `$` would also match before a
    # newline embedded in the path and rewrite the wrong segment.
    @input_file = @target_path.sub(/\.target\.json\z/, '.input.txt')
  end
end
|
data/lib/kaba/dataset_source.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'fileutils'

# A filesystem location inside a dataset directory.
#
# Instances wrap a path string and hand out new instances for sub-paths, so a
# script can write `source.schema.join('resume.ts').read`. Because the class
# implements #to_path, core File methods such as File.expand_path accept an
# instance wherever they accept a path string.
class DatasetSource
  # @return [String] the wrapped path
  attr_reader :path

  # @param path [String] file or directory path
  def initialize(path)
    @path = path
  end

  # @return [DatasetSource] the `row` entry below this path
  def row
    join('row')
  end

  # @return [DatasetSource] the `schema` entry below this path
  def schema
    join('schema')
  end

  # @return [String] contents of the file at this path
  def read
    File.read(@path)
  end

  # @return [String] the wrapped path
  def to_s
    @path
  end

  # @return [String] the wrapped path (implicit path conversion for File APIs)
  def to_path
    @path
  end

  # @param name [String] entry name appended to this path
  # @return [DatasetSource] a new source for the joined path
  def join(name)
    self.class.new(File.join(@path, name))
  end

  # Returns the path of `DPodfile` in the current working directory. When the
  # file does not exist yet, it is first created as a copy of the bundled
  # `_DPodfile_` template; an existing file is never overwritten.
  #
  # @return [String] absolute path of the DPodfile
  def self.podfile
    target = File.join(Dir.pwd, 'DPodfile')
    FileUtils.cp(File.join(__dir__, '_DPodfile_'), target) unless File.exist?(target)
    target
  end
end
|
41
|
+
|
data/lib/kaba/prompt.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Produces prompt text for a schema and type name by posting to the `/prompt`
# endpoint of Application.connection.
class Prompt
  # Builds a Prompt from a schema stored on disk.
  #
  # @param schema_path [String, #to_path] path of the schema file
  # @param type_name [String] name of the type inside the schema
  # @return [Prompt]
  def self.file(schema_path, type_name)
    new(File.read(File.expand_path(schema_path)), type_name)
  end

  # @param schema [String] schema source text
  # @param type_name [String] name of the type inside the schema
  def initialize(schema, type_name)
    @schema = schema
    @type_name = type_name
  end

  # @return [String] the schema with every `export default ` and then every
  #   remaining `export ` keyword removed
  def clear_export
    # `export default` has to go first; otherwise the second pass would strip
    # only `export` and leave a dangling `default`.
    without_default = @schema.gsub(/export\s+default\s+/, '')
    without_default.gsub(/export\s+/, '')
  end

  # Posts the schema, type name and input to `/prompt` and returns the body
  # of the response.
  #
  # @param input [String] text the prompt is rendered for
  # @param export [Boolean] when true the schema is sent with its export
  #   keywords stripped (see #clear_export); when false it is sent as given
  # @return [Object] the response body
  def render(input, export: false)
    payload = {
      schema: export ? clear_export : @schema,
      typeName: @type_name,
      # NOTE(review): `inpu` looks like a typo for `input`; left untouched
      # because the key the server expects cannot be confirmed from here.
      inpu: input
    }
    Application.connection.post('/prompt', payload).body
  end
end
|
data/lib/kaba/validate.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
# Validates JSON documents against a schema by posting them to the
# `/validate` endpoint of Application.connection.
class Validate
  # @param schema [String] schema source text sent with every request
  # @param type_name [String] name of the type the documents must satisfy
  def initialize(schema:, type_name:)
    @schema = schema
    @type_name = type_name
  end

  # Posts one document for validation.
  #
  # @param input [Object] parsed JSON data to validate
  # @return [Object] the raw HTTP response
  def run(input)
    Application.connection.post(
      '/validate',
      { schema: @schema, typeName: @type_name, jsonData: input }
    )
  end

  # Reads one JSON file and validates its content.
  #
  # @param file [String, #to_path] path of the JSON file
  # @return [ValidateReponse] result, tagged with the expanded file path
  def run_file(file)
    path = File.expand_path(file)
    ValidateReponse.new(run(JSON.parse(File.read(path))), file: path)
  end

  # Validates every `*.target.json` file in +dir+, one async task per file,
  # and advances a progress bar as results arrive.
  #
  # With a block, each result is passed to it as `(response, progressbar)`.
  # Without one, failed results are written to the progress bar log.
  #
  # @param dir [String, #to_path] directory containing the files
  # @param limit [Integer, nil] maximum number of files to read; nil reads all
  def run_files(dir, limit: nil, &block)
    files = Dir[File.join(File.expand_path(dir), '*.target.json')]
    files = files.first(limit) if limit

    progressbar = TTY::ProgressBar.new("Validate: [:bar] :percent :current/:total", total: files.size)

    Async do
      files.each do |file|
        Async do
          response = ValidateReponse.new(run(JSON.parse(File.read(file))), file: file)

          if block
            block.call(response, progressbar)
          elsif !response.success?
            report_failure(response, progressbar)
          end
          progressbar.advance
        end
      end
    end.wait
  end

  # Result of one validation request.
  class ValidateReponse
    # @return [Object] raw HTTP response
    # @return [Hash] decoded response body
    # @return [String, nil] file the validated data came from, when known
    attr_reader :response, :body, :file

    # @param response [#status, #body] HTTP response of the validate call
    # @param file [String, nil] file the validated data was read from
    def initialize(response, file: nil)
      @response = response
      @body = parse_body(response.body)
      @file = file
    end

    # @return [Boolean] truthy only for HTTP 200 with a truthy `success` field
    def success?
      @response.status == 200 && @body["success"]
    end

    # @return [Object, nil] the `message` field of the body
    def message
      @body["message"]
    end

    # @return [Object, nil] the `data` field of the body
    def data
      @body["data"]
    end

    # Multi-line, colorized summary: status, then the file when one is known,
    # then the message for failed results, followed by a blank line.
    #
    # @return [String]
    def to_s
      parts = ["#{'success:'.colorize(:bold_blue)} #{success? ? 'true'.colorize(:green) : 'false'.colorize(:red)}"]
      # file is nil for responses built without one; skip the line rather than call colorize on nil
      parts << "#{'file:'.colorize(:bold_blue)} #{file.to_s.colorize(:yellow)}" if file
      parts << "#{'message:'.colorize(:bold_blue)} #{message.to_s.colorize(:yellow)}" unless success?
      "#{parts.join("\n")}\n\n"
    end

    private

    # Decodes the response body. A body that is not a JSON object (for
    # example an HTML error page) becomes a failed result whose message is
    # the raw body, instead of raising from the constructor.
    def parse_body(raw)
      parsed = JSON.parse(raw)
      parsed.is_a?(Hash) ? parsed : { "success" => false, "message" => raw.to_s }
    rescue JSON::ParserError, TypeError
      { "success" => false, "message" => raw.to_s }
    end
  end

  private

  # Writes a failed result to the progress bar log, one log entry per line.
  def report_failure(response, progressbar)
    progressbar.log "validate failed".colorize(:red)
    response.to_s.split("\n").each { |line| progressbar.log line }
    progressbar.log "\n"
  end
end
|
data/lib/kaba/version.rb
ADDED
data/lib/kaba.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "kaba/version"
|
4
|
+
require_relative "kaba/dataset"
|
5
|
+
require_relative "kaba/dataset_source"
|
6
|
+
require_relative "kaba/prompt"
|
7
|
+
require_relative "kaba/validate"
|
8
|
+
|
9
|
+
module Kaba
|
10
|
+
class Error < StandardError; end
|
11
|
+
# Your code goes here...
|
12
|
+
end
|
data/sig/kaba.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kaba
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- MJ
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: async
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.20'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.20'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faraday
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.12'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.12'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: async-http-faraday
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.19.0
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.19.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: colorize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.1'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.1'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: tty-progressbar
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.18.3
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.18.3
|
83
|
+
description: 用来做数据集的工具
|
84
|
+
email:
|
85
|
+
- tywf91@gmail.com
|
86
|
+
executables:
|
87
|
+
- kaba
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- README.md
|
92
|
+
- Rakefile
|
93
|
+
- exe/kaba
|
94
|
+
- kaba.gemspec
|
95
|
+
- lib/kaba.rb
|
96
|
+
- lib/kaba/_DPodfile_
|
97
|
+
- lib/kaba/dataset.rb
|
98
|
+
- lib/kaba/dataset_source.rb
|
99
|
+
- lib/kaba/prompt.rb
|
100
|
+
- lib/kaba/validate.rb
|
101
|
+
- lib/kaba/version.rb
|
102
|
+
- sig/kaba.rbs
|
103
|
+
homepage: https://github.com/mjason/kaba.git
|
104
|
+
licenses: []
|
105
|
+
metadata:
|
106
|
+
homepage_uri: https://github.com/mjason/kaba.git
|
107
|
+
source_code_uri: https://github.com/mjason/kaba.git
|
108
|
+
changelog_uri: https://github.com/mjason/kaba.git
|
109
|
+
post_install_message:
|
110
|
+
rdoc_options: []
|
111
|
+
require_paths:
|
112
|
+
- lib
|
113
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 3.3.0
|
118
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
requirements: []
|
124
|
+
rubygems_version: 3.5.3
|
125
|
+
signing_key:
|
126
|
+
specification_version: 4
|
127
|
+
summary: 用来做数据集的工具
|
128
|
+
test_files: []
|