kaba 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +9 -0
- data/Rakefile +4 -0
- data/exe/kaba +30 -0
- data/kaba.gemspec +42 -0
- data/lib/kaba/_DPodfile_ +51 -0
- data/lib/kaba/dataset.rb +68 -0
- data/lib/kaba/dataset_source.rb +41 -0
- data/lib/kaba/prompt.rb +26 -0
- data/lib/kaba/validate.rb +84 -0
- data/lib/kaba/version.rb +5 -0
- data/lib/kaba.rb +12 -0
- data/sig/kaba.rbs +4 -0
- metadata +128 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7d0d15c1454667339192ecb68a40a0128221f0e2f176c9668fdddbaf6507a143
|
4
|
+
data.tar.gz: 63d4b6088c24453874a2acf9a1c183b6d7468a0297c43cf32feccd0e838f84c0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0e937f2aea17053fd01b8a4b5f76f61d58d2817eed4e12942f4edaee0ef72b8cf2fc2c3febeb082d49131fad6aba495bfc7385d59bd13c127b8faaa0ad1a7719
|
7
|
+
data.tar.gz: 592a977e7ec2c0b80f76e9f4c9fa0875357466ab1e6cbb05e9dec0fdabe086d8f25fda39caad60d0334ed145a69483ef6ea091d25bc5c39299c9afc5380d6155
|
data/README.md
ADDED
data/Rakefile
ADDED
data/exe/kaba
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "bundler/setup"
|
3
|
+
|
4
|
+
require 'async'
|
5
|
+
require 'faraday'
|
6
|
+
require 'colorize'
|
7
|
+
require 'tty-progressbar'
|
8
|
+
require 'async/http/faraday'
|
9
|
+
|
10
|
+
require 'json'
|
11
|
+
require "kaba"
|
12
|
+
|
13
|
+
# Process-wide holder for the HTTP client used to talk to the TypeChat service.
class Application
  # Returns the shared Faraday connection, building it on first use.
  #
  # The connection targets the hosted service, uses the async-http adapter
  # with persistent clients, and encodes request bodies as JSON.
  def self.connection
    return @connection if @connection

    @connection = Faraday.new('https://lisa-typechat.listenai.com') do |builder|
      builder.adapter :async_http, clients: Async::HTTP::Faraday::PersistentClients
      builder.request :json
    end
  end
end
|
23
|
+
|
24
|
+
# Debugging helper for DPodfile scripts: loads IRB lazily and opens an
# interactive session on this method's own binding.
def ddbug
  require('irb')
  binding.irb
end
|
28
|
+
|
29
|
+
# Run the DPodfile, which is a plain Ruby file, resolved by DatasetSource.podfile.
dpodfile_path = DatasetSource.podfile
load(dpodfile_path)
|
data/kaba.gemspec
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/kaba/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
  # Values that the spec repeats in several fields.
  repository_url = "https://github.com/mjason/kaba.git"
  tagline = "用来做数据集的工具"

  spec.name = "kaba"
  spec.version = Kaba::VERSION
  spec.authors = ["MJ"]
  spec.email = ["tywf91@gmail.com"]

  spec.summary = tagline
  spec.description = tagline
  spec.homepage = repository_url
  spec.required_ruby_version = ">= 3.3.0"

  # spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = repository_url
  spec.metadata["changelog_uri"] = repository_url

  # Package every git-tracked file except this gemspec itself and
  # development-only paths.
  excluded_prefixes = %w[bin/ test/ spec/ features/ .git .github appveyor Gemfile]
  spec.files = Dir.chdir(__dir__) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject do |path|
      (File.expand_path(path) == __FILE__) || path.start_with?(*excluded_prefixes)
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared in the same order as before.
  {
    "async" => "~> 2.20",
    "faraday" => "~> 2.12",
    "async-http-faraday" => "~> 0.19.0",
    "colorize" => "~> 1.1",
    "tty-progressbar" => "~> 0.18.3"
  }.each do |gem_name, constraint|
    spec.add_dependency gem_name, constraint
  end

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
|
data/lib/kaba/_DPodfile_
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
## Dataset validation script written in Ruby
# ddbug: breakpoint debugging
# Use colorize to print colored messages: https://github.com/fazibear/colorize
# Use progressbar to display a progress bar: https://github.com/piotrmurach/tty-progressbar
# Set the dataset directory. When running via Docker, mount the dataset at /data.
# NOTE(review): the original note says DatasetSource loads /data automatically; that is not visible in this script — confirm.
source = DatasetSource.new(File.join(__dir__, 'data'))
schema = source.schema.join('resume.ts').read
type_name = 'Resume'
prompt = Prompt.new(schema, type_name)
validate = Validate.new(schema: schema, type_name: type_name)
dataset = Dataset.new(source.row, prompt)

# #
# # run_file can be used to validate a single file
# response = validate.run_file(source.row.join('1.target.json'))
# puts response
# #

## Usually calling run_files directly is enough; it accepts limit to cap the number of files read
validate.run_files(source.row)

## Dataset processing
# Accepts limit to cap the number of files read
dataset.scan()
dataset.save(source.join('train.jsonl'))

## Advanced usage — do not try this lightly
#
### A limit can be passed to cap the number of files read, e.g. validate.run_files('./data/row', limit: 1) do |response, progressbar|
# validate.run_files(source.row) do |response, progressbar|
#   progressbar.log response.to_s unless response.success?
# end
#
#
## limit is supported here as well
# Async do
#   dataset.each do |row, ds|
#     Async do
#       instruction = prompt.render(File.read row.input_file)
#       target = <<~Markdown
#       ```json
#       #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
#       ```
#       Markdown
#       ds.add({ instruction: instruction, output: target })

#       instruction = prompt.render(File.read(row.input_file), export: true)
#       ds.add({ instruction: instruction, output: target })
#     end
#   end
# end.wait
|
data/lib/kaba/dataset.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# Builds a JSONL training dataset from `*.target.json` files and their
# sibling `*.input.txt` files found in a data directory.
class Dataset
  # @return [Array<Hash>] records collected so far, in insertion order
  attr_reader :lines

  # @param data_dir [String, #to_path] directory holding `*.target.json` files
  # @param prompt [#render] object whose `render(input, export:)` returns the instruction text
  def initialize(data_dir, prompt)
    @data_files = Dir.glob(File.join(File.expand_path(data_dir), '*.target.json'))
    @lines = []
    @prompt = prompt
  end

  # Yields `(Row, self)` for each target file so the caller can build records
  # and push them with #add. No warning is printed; #scan uses this internally.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil visits all
  def _each(limit: nil)
    selected_files(limit).each do |file|
      yield(Row.new(file), self)
    end
  end

  # Public variant of #_each that first prints a warning in red.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil visits all
  def each(limit: nil, &block)
    puts "Warning: each is very dangerous".colorize(:red)
    _each(limit: limit, &block)
  end

  # Appends one record to the dataset.
  def add(data)
    @lines << data
  end

  # Writes every collected record to +file_path+ as one JSON document per line.
  def save(file_path)
    File.open(File.expand_path(file_path), 'w') do |file|
      @lines.each do |line|
        file.puts(line.to_json)
      end
    end
  end

  # True when exactly two records exist per discovered target file (#scan adds
  # two per file). A scan limited to fewer files therefore reports false.
  def validate
    @lines.size == (@data_files.size * 2)
  end

  # Renders two records per file (schema as-is, and with `export: true`) and
  # adds them to the dataset, advancing a progress bar after each file.
  #
  # @param limit [Integer, nil] maximum number of files to process; nil processes all
  def scan(limit: nil)
    # Size the bar by the files actually processed so it can reach 100% with a limit.
    progressbar = TTY::ProgressBar.new(
      "Dataset: [:bar] :percent :current/:total",
      total: selected_files(limit).size
    )
    Async do
      _each(limit: limit) do |row, ds|
        Async do
          input = File.read(row.input_file)

          instruction = @prompt.render(input)
          target = fenced_target(row)
          ds.add({ instruction: instruction, output: target })

          instruction = @prompt.render(input, export: true)
          ds.add({ instruction: instruction, output: target })

          progressbar.advance
        end
      end
    end.wait
  end

  private

  # Files to visit for the given limit (all of them when limit is nil).
  def selected_files(limit)
    limit ? @data_files.first(limit) : @data_files
  end

  # Pretty-printed target JSON wrapped in a Markdown ```json fence.
  def fenced_target(row)
    <<~Markdown
      ```json
      #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
      ```
    Markdown
  end
end
|
61
|
+
|
62
|
+
# One dataset entry: a `*.target.json` file paired with its `*.input.txt` sibling.
class Row
  # @return [String] absolute path of the target JSON file
  # @return [String] path of the input text file derived from the target path
  attr_reader :target_path, :input_file

  # @param file [String] path to a `*.target.json` file
  def initialize(file)
    @target_path = File.expand_path(file)
    # \z anchors at the very end of the string; `$` would also match before an
    # embedded newline and rewrite the middle of such a path.
    @input_file = @target_path.sub(/\.target\.json\z/, '.input.txt')
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'fileutils' # DatasetSource.podfile copies the template with FileUtils.cp

# Lightweight path wrapper for a dataset directory tree.
#
# Instances respond to #to_path, so they can be passed straight to File APIs
# such as File.expand_path.
class DatasetSource
  # @return [String] the wrapped filesystem path
  attr_reader :path

  # @param path [String] filesystem path to wrap
  def initialize(path)
    @path = path
  end

  # Defines #row and #schema, each returning a DatasetSource for the
  # sub-directory of the same name.
  %i[row schema].each do |method_name|
    define_method(method_name) do
      self.class.new(File.join(@path, method_name.to_s))
    end
  end

  # Reads the file at the wrapped path and returns its contents.
  def read
    File.read @path
  end

  # @return [String] the wrapped path
  def to_s
    @path
  end

  # @return [String] the wrapped path (implicit path conversion for File APIs)
  def to_path
    @path
  end

  # Returns a new DatasetSource for +name+ beneath the wrapped path.
  def join(name)
    self.class.new File.join(@path, name)
  end

  class << self
    # Returns the path of `DPodfile` in the current working directory. If the
    # file does not exist yet, the `_DPodfile_` template that sits next to this
    # source file is copied there first.
    def podfile
      d_podfile_path = File.join(Dir.pwd, 'DPodfile')
      unless File.exist?(d_podfile_path)
        FileUtils.cp(File.join(__dir__, '_DPodfile_'), d_podfile_path)
      end
      d_podfile_path
    end
  end
end
|
41
|
+
|
data/lib/kaba/prompt.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Builds prompt text by posting a schema, a type name and an input to the
# service's /prompt endpoint.
class Prompt
  # Convenience constructor that reads the schema text from +schema_path+.
  def self.file(schema_path, type_name)
    absolute_path = File.expand_path(schema_path)
    new(File.read(absolute_path), type_name)
  end

  # @param schema [String] schema source text
  # @param type_name [String] name of the type inside the schema
  def initialize(schema, type_name)
    @schema = schema
    @type_name = type_name
  end

  # Returns the schema with every `export default ` removed first, then every
  # remaining `export ` keyword.
  def clear_export
    without_default = @schema.gsub(/export\s+default\s+/, '')
    without_default.gsub(/export\s+/, '')
  end

  # Posts the request to /prompt and returns the response body.
  #
  # @param input [String] text to embed in the prompt
  # @param export [Boolean] when true, send the schema with `export` keywords stripped
  def render(input, export: false)
    payload = {
      schema: (export ? clear_export : @schema),
      typeName: @type_name,
      # NOTE(review): the key is spelled `inpu`; it looks like a typo for
      # `input`. Confirm against the /prompt endpoint before renaming.
      inpu: input
    }
    reply = Application.connection.post('/prompt', payload)
    reply.body
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
## 用于将所有数据进行验证
|
2
|
+
# Validates dataset JSON files against a schema through the service's
# /validate endpoint.
class Validate
  # @param schema [String] schema source text
  # @param type_name [String] name of the type inside the schema
  def initialize(schema:, type_name:)
    @schema = schema
    @type_name = type_name
  end

  # Posts +input+ to /validate and returns the raw HTTP response.
  #
  # @param input [Object] parsed JSON data to validate
  def run(input)
    request_body = {
      schema: @schema,
      typeName: @type_name,
      jsonData: input
    }

    Application.connection.post('/validate', request_body)
  end

  # Reads one JSON file, validates it, and returns a ValidateReponse that
  # remembers which file it belongs to.
  #
  # @param file [String, #to_path] path to a JSON file
  # @return [ValidateReponse]
  def run_file(file)
    path = File.expand_path(file)
    input = JSON.parse(File.read(path))
    ValidateReponse.new(run(input), file: path)
  end

  # Validates every `*.target.json` file in +dir+, one Async task per file.
  # When a block is given it is called with `(response, progressbar)` for each
  # result; otherwise failures are logged through the progress bar.
  #
  # @param dir [String, #to_path] directory to scan
  # @param limit [Integer, nil] maximum number of files to read; nil reads all
  def run_files(dir, limit: nil, &block)
    files = Dir[File.join(File.expand_path(dir), '*.target.json')]
    files = files.first(limit) if limit

    progressbar = TTY::ProgressBar.new("Validate: [:bar] :percent :current/:total", total: files.size)

    Async do
      files.each do |file|
        Async do
          input = JSON.parse(File.read(file))
          response = ValidateReponse.new(run(input), file: file)

          if block
            block.call(response, progressbar)
          elsif !response.success?
            progressbar.log "validate failed".colorize(:red)
            response.to_s.split("\n").each do |line|
              progressbar.log line
            end
            progressbar.log "\n"
          end
          progressbar.advance
        end
      end
    end.wait
  end

  # Wraps the HTTP response of a /validate call together with its parsed body.
  # (The class name keeps its historical spelling; see the alias below.)
  class ValidateReponse
    attr_reader :response, :body, :file

    # @param response [#status, #body] HTTP response returned by Validate#run
    # @param file [String, nil] path of the validated file, when known
    def initialize(response, file: nil)
      @response = response
      @body = parse_body(response.body)
      @file = file
    end

    # Truthy only for HTTP 200 responses whose body reports success.
    def success?
      @response.status == 200 && @body["success"]
    end

    # @return [String, nil] the body's "message" field
    def message
      @body["message"]
    end

    # @return [Object, nil] the body's "data" field
    def data
      @body["data"]
    end

    # Human-readable, colorized summary. The file line is omitted when no file
    # is known, and a missing message is rendered as an empty string.
    def to_s
      parts = ["#{'success:'.colorize(:bold_blue)} #{success? ? 'true'.colorize(:green) : 'false'.colorize(:red)}"]
      parts << "#{'file:'.colorize(:bold_blue)} #{file.to_s.colorize(:yellow)}" if file
      parts << "#{'message:'.colorize(:bold_blue)} #{message.to_s.colorize(:yellow)}" unless success?
      "#{parts.join("\n")}\n\n"
    end

    private

    # Parses the response body as a JSON object. A body that is not valid JSON
    # (or is not a JSON object) yields an empty hash, so the response reads as
    # a failure instead of raising from the constructor.
    def parse_body(raw)
      parsed = JSON.parse(raw)
      parsed.is_a?(Hash) ? parsed : {}
    rescue JSON::ParserError, TypeError
      {}
    end
  end

  # Correctly spelled alias; the original constant remains available.
  ValidateResponse = ValidateReponse
end
|
data/lib/kaba/version.rb
ADDED
data/lib/kaba.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "kaba/version"
|
4
|
+
require_relative "kaba/dataset"
|
5
|
+
require_relative "kaba/dataset_source"
|
6
|
+
require_relative "kaba/prompt"
|
7
|
+
require_relative "kaba/validate"
|
8
|
+
|
9
|
+
# Top-level namespace of the gem.
module Kaba
  # Base class for errors raised by this gem.
  Error = Class.new(StandardError)
  # Your code goes here...
end
|
data/sig/kaba.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kaba
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- MJ
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: async
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.20'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.20'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faraday
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.12'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.12'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: async-http-faraday
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.19.0
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.19.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: colorize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.1'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.1'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: tty-progressbar
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.18.3
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.18.3
|
83
|
+
description: 用来做数据集的工具
|
84
|
+
email:
|
85
|
+
- tywf91@gmail.com
|
86
|
+
executables:
|
87
|
+
- kaba
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- README.md
|
92
|
+
- Rakefile
|
93
|
+
- exe/kaba
|
94
|
+
- kaba.gemspec
|
95
|
+
- lib/kaba.rb
|
96
|
+
- lib/kaba/_DPodfile_
|
97
|
+
- lib/kaba/dataset.rb
|
98
|
+
- lib/kaba/dataset_source.rb
|
99
|
+
- lib/kaba/prompt.rb
|
100
|
+
- lib/kaba/validate.rb
|
101
|
+
- lib/kaba/version.rb
|
102
|
+
- sig/kaba.rbs
|
103
|
+
homepage: https://github.com/mjason/kaba.git
|
104
|
+
licenses: []
|
105
|
+
metadata:
|
106
|
+
homepage_uri: https://github.com/mjason/kaba.git
|
107
|
+
source_code_uri: https://github.com/mjason/kaba.git
|
108
|
+
changelog_uri: https://github.com/mjason/kaba.git
|
109
|
+
post_install_message:
|
110
|
+
rdoc_options: []
|
111
|
+
require_paths:
|
112
|
+
- lib
|
113
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 3.3.0
|
118
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
requirements: []
|
124
|
+
rubygems_version: 3.5.3
|
125
|
+
signing_key:
|
126
|
+
specification_version: 4
|
127
|
+
summary: 用来做数据集的工具
|
128
|
+
test_files: []
|