kaba 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7d0d15c1454667339192ecb68a40a0128221f0e2f176c9668fdddbaf6507a143
4
+ data.tar.gz: 63d4b6088c24453874a2acf9a1c183b6d7468a0297c43cf32feccd0e838f84c0
5
+ SHA512:
6
+ metadata.gz: 0e937f2aea17053fd01b8a4b5f76f61d58d2817eed4e12942f4edaee0ef72b8cf2fc2c3febeb082d49131fad6aba495bfc7385d59bd13c127b8faaa0ad1a7719
7
+ data.tar.gz: 592a977e7ec2c0b80f76e9f4c9fa0875357466ab1e6cbb05e9dec0fdabe086d8f25fda39caad60d0334ed145a69483ef6ea091d25bc5c39299c9afc5380d6155
data/README.md ADDED
@@ -0,0 +1,9 @@
1
+ # Kaba
2
+
3
+ ## 目录结构
4
+ - data
5
+ - row
6
+ - schema
7
+
8
+ ## 命令
9
+ `gem install kaba`
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/exe/kaba ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+ require "bundler/setup"
3
+
4
+ require 'async'
5
+ require 'faraday'
6
+ require 'colorize'
7
+ require 'tty-progressbar'
8
+ require 'async/http/faraday'
9
+
10
+ require 'json'
11
+ require "kaba"
12
+
13
# Process-wide holder for the HTTP client shared by Prompt and Validate.
class Application
  # Returns the memoized Faraday connection, building it on first use.
  #
  # The connection targets https://lisa-typechat.listenai.com, is configured
  # with the :async_http adapter (given Async::HTTP::Faraday::PersistentClients
  # as its `clients:` option) and the :json request middleware.
  #
  # @return [Faraday::Connection]
  def self.connection
    return @connection if @connection

    @connection = Faraday.new('https://lisa-typechat.listenai.com') do |conn|
      conn.adapter :async_http, clients: Async::HTTP::Faraday::PersistentClients
      conn.request :json
    end
  end
end
23
+
24
# Opens an interactive IRB session for breakpoint-style debugging.
#
# @param context [Binding, nil] binding to inspect. Pass `binding` from the
#   call site (`ddbug(binding)`) to see the caller's local variables. When
#   omitted, the session opens inside this helper, where the caller's locals
#   are not visible.
# @return [void]
def ddbug(context = nil)
  require 'irb' # loaded lazily so normal runs do not load IRB
  (context || binding).irb
end
28
+
29
+ # 运行 DPodfile 文件,DPodfile 是一个 Ruby 文件
30
+ load DatasetSource.podfile
data/kaba.gemspec ADDED
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/kaba/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "kaba"
7
+ spec.version = Kaba::VERSION
8
+ spec.authors = ["MJ"]
9
+ spec.email = ["tywf91@gmail.com"]
10
+
11
+ spec.summary = "用来做数据集的工具"
12
+ spec.description = "用来做数据集的工具"
13
+ spec.homepage = "https://github.com/mjason/kaba.git"
14
+ spec.required_ruby_version = ">= 3.3.0"
15
+
16
+ # spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
17
+
18
+ spec.metadata["homepage_uri"] = spec.homepage
19
+ spec.metadata["source_code_uri"] = "https://github.com/mjason/kaba.git"
20
+ spec.metadata["changelog_uri"] = "https://github.com/mjason/kaba.git"
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(__dir__) do
25
+ `git ls-files -z`.split("\x0").reject do |f|
26
+ (File.expand_path(f) == __FILE__) ||
27
+ f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
28
+ end
29
+ end
30
+ spec.bindir = "exe"
31
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
32
+ spec.require_paths = ["lib"]
33
+
34
+ spec.add_dependency "async", "~> 2.20"
35
+ spec.add_dependency "faraday", "~> 2.12"
36
+ spec.add_dependency "async-http-faraday", "~> 0.19.0"
37
+ spec.add_dependency "colorize", "~> 1.1"
38
+ spec.add_dependency "tty-progressbar", "~> 0.18.3"
39
+
40
+ # For more information and examples about making a new gem, check out our
41
+ # guide at: https://bundler.io/guides/creating_gem.html
42
+ end
@@ -0,0 +1,51 @@
1
+ ## 使用 Ruby 语言编写的数据集校验脚本
2
+ # ddbug 断点调试
3
+ # 使用 colorize 来输出带颜色的信息,https://github.com/fazibear/colorize
4
+ # 使用 progressbar 来显示进度条,https://github.com/piotrmurach/tty-progressbar
5
+ # 设置数据集目录, 如果使用 Docker 方式运行,需要将数据集挂载到 /data 目录下,DatasetSource 会自动加载 /data 目录下的数据集
6
+ source = DatasetSource.new(File.join(__dir__, 'data'))
7
+ schema = source.schema.join('resume.ts').read
8
+ type_name = 'Resume'
9
+ prompt = Prompt.new(schema, type_name)
10
+ validate = Validate.new(schema: schema, type_name: type_name)
11
+ dataset = Dataset.new(source.row, prompt)
12
+
13
+ # #
14
+ # # 可以使用 run_file 来验证单个文件
15
+ # response = validate.run_file(source.row.join('1.target.json'))
16
+ # puts response
17
+ # #
18
+
19
+ ## 一般来说直接 run_files 就可以了,支持 limit 来限制读取的文件数量
20
+ validate.run_files(source.row)
21
+
22
+ ## 数据集处理部分
23
+ # 支持 limit 来限制读取的文件数量
24
+ dataset.scan()
25
+ dataset.save(source.join('train.jsonl'))
26
+
27
+ ## 高级玩法,不要轻易尝试
28
+ #
29
+ ### 可以加入 limit 来限制读取的文件数量,例如 validate.run_files('./data/row', limit: 1) do |response, progressbar|
30
+ # validate.run_files(source.row) do |response, progressbar|
31
+ # progressbar.log response.to_s unless response.success?
32
+ # end
33
+ #
34
+ #
35
+ ## 也支持 limit 来进行限制
36
+ # Async do
37
+ # dataset.each do |row, ds|
38
+ # Async do
39
+ # instruction = prompt.render(File.read row.input_file)
40
+ # target = <<~Markdown
41
+ # ```json
42
+ # #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
43
+ # ```
44
+ # Markdown
45
+ # ds.add({ instruction: instruction, output: target })
46
+
47
+ # instruction = prompt.render(File.read(row.input_file), export: true)
48
+ # ds.add({ instruction: instruction, output: target })
49
+ # end
50
+ # end
51
+ # end.wait
@@ -0,0 +1,68 @@
1
# Builds a dataset from the `*.target.json` files found in a directory.
#
# Every target file is wrapped in a Row (which also exposes the matching input
# file). #scan renders two prompts per row and collects the resulting records
# in #lines; #save writes them out as one JSON document per line.
class Dataset
  # @return [Array] records collected so far through #add
  attr_reader :lines

  # @param data_dir [String, #to_path] directory containing `*.target.json` files
  # @param prompt [#render] object used by #scan to build instructions
  def initialize(data_dir, prompt)
    @data_files = Dir.glob(File.join(File.expand_path(data_dir), '*.target.json'))
    @lines = []
    @prompt = prompt
  end

  # Yields `(row, dataset)` for each selected target file, without the warning
  # printed by #each.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil means all
  def _each(limit: nil)
    selected_files(limit).each do |file|
      yield(Row.new(file), self)
    end
  end

  # Same iteration as #_each, but prints a red warning first because callers
  # take over responsibility for filling the dataset through #add.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil means all
  def each(limit: nil, &block)
    puts "Warning: each is very dangerous".colorize(:red)
    _each(limit: limit, &block)
  end

  # Appends one record to the dataset.
  #
  # @param data [#to_json] record to store
  def add(data)
    @lines << data
  end

  # Writes every collected record to +file_path+, one JSON document per line.
  #
  # @param file_path [String, #to_path] destination file (overwritten)
  def save(file_path)
    File.open(File.expand_path(file_path), 'w') do |file|
      @lines.each do |line|
        file.puts(line.to_json)
      end
    end
  end

  # Checks that exactly two records exist per discovered target file.
  # This counts ALL discovered files, so it is false after a scan that
  # used +limit+ to visit only a subset.
  #
  # @return [Boolean]
  def validate
    @lines.size == (@data_files.size * 2)
  end

  # Renders two records per row (one with the schema as-is, one with
  # `export: true`) and adds them to the dataset, advancing a progress bar
  # once per row.
  #
  # @param limit [Integer, nil] maximum number of files to process; nil means all
  def scan(limit: nil)
    # Size the bar by the files actually processed so it completes when limit is set.
    total = selected_files(limit).size
    progressbar = TTY::ProgressBar.new("Dataset: [:bar] :percent :current/:total", total: total)
    Async do
      _each(limit: limit) do |row, ds|
        Async do
          input = File.read(row.input_file) # read once, reused for both prompts
          target = <<~Markdown
            ```json
            #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
            ```
          Markdown

          ds.add({ instruction: @prompt.render(input), output: target })
          ds.add({ instruction: @prompt.render(input, export: true), output: target })

          progressbar.advance
        end
      end
    end.wait
  end

  private

  # Returns the target files to visit for the given limit (all when nil).
  def selected_files(limit)
    @data_files.first(limit || @data_files.size)
  end
end
61
+
62
# Pairs one `*.target.json` file with its sibling `*.input.txt` file.
class Row
  # @return [String] absolute path of the target JSON file
  # @return [String] path of the matching input file (same name, `.input.txt` suffix)
  attr_reader :target_path, :input_file

  # @param file [String, #to_path] path to a `*.target.json` file
  def initialize(file)
    @target_path = File.expand_path(file)
    # \z anchors at the real end of the string; `$` would also match before a newline.
    @input_file = @target_path.sub(/\.target\.json\z/, '.input.txt')
  end
end
@@ -0,0 +1,41 @@
1
require 'fileutils' # DatasetSource.podfile copies the template with FileUtils.cp

# Lightweight path wrapper used to address the pieces of a dataset directory.
#
# Instances respond to #to_path, so they can be passed directly to File APIs
# such as File.expand_path or File.read.
class DatasetSource
  # @return [String] the wrapped path
  attr_reader :path

  # @param path [String] filesystem path to wrap
  def initialize(path)
    @path = path
  end

  # Defines #row and #schema, each returning a new DatasetSource for the
  # sub-path of the same name.
  %i[row schema].each do |method_name|
    define_method(method_name) do
      self.class.new(File.join(@path, method_name.to_s))
    end
  end

  # @return [String] contents of the file at the wrapped path
  def read
    File.read(@path)
  end

  # @return [String] the wrapped path
  def to_s
    @path
  end

  # @return [String] the wrapped path (implicit conversion for File APIs)
  def to_path
    @path
  end

  # @param name [String] child name to append
  # @return [DatasetSource] wrapper for `path/name`
  def join(name)
    self.class.new(File.join(@path, name))
  end

  # Returns the path of the `DPodfile` in the current working directory,
  # creating it from the bundled `_DPodfile_` template when it does not exist.
  #
  # @return [String] absolute path of the DPodfile
  def self.podfile
    d_podfile_path = File.join(Dir.pwd, 'DPodfile')
    template_path = File.join(__dir__, '_DPodfile_')
    FileUtils.cp(template_path, d_podfile_path) unless File.exist?(d_podfile_path)
    d_podfile_path
  end
end
41
+
@@ -0,0 +1,26 @@
1
# Renders prompts by posting a schema, a type name and input text to the
# remote `/prompt` endpoint.
class Prompt
  # Matches an `export` keyword (optionally followed by `default`) together
  # with its trailing whitespace. The leading \b keeps identifiers that merely
  # end in "export" intact.
  EXPORT_KEYWORD = /\bexport\s+(?:default\s+)?/

  # @param schema [String] schema source text
  # @param type_name [String] name of the type inside the schema to target
  def initialize(schema, type_name)
    @schema = schema
    @type_name = type_name
  end

  # @return [String] the schema with `export` / `export default` keywords removed
  def clear_export
    @schema.gsub(EXPORT_KEYWORD, '')
  end

  # Requests a rendered prompt for +input+ from the remote service.
  #
  # @param input [String] text to embed in the prompt
  # @param export [Boolean] when true, the schema is sent with its `export`
  #   keywords stripped (see #clear_export); when false it is sent unchanged
  # @return [Object] body of the HTTP response
  def render(input, export: false)
    schema = export ? clear_export : @schema
    request_body = {
      schema: schema,
      typeName: @type_name,
      # NOTE(review): `inpu` looks like a typo for `input`. It is kept unchanged
      # because the key is part of the wire format; confirm against the
      # /prompt endpoint before renaming.
      inpu: input
    }
    Application.connection.post('/prompt', request_body).body
  end

  # Builds a Prompt from a schema stored on disk.
  #
  # @param schema_path [String, #to_path] path of the schema file
  # @param type_name [String] name of the type inside the schema to target
  # @return [Prompt]
  def self.file(schema_path, type_name)
    new(File.read(File.expand_path(schema_path)), type_name)
  end
end
@@ -0,0 +1,84 @@
1
# Validates JSON documents against a schema by calling the remote
# `/validate` endpoint.
class Validate
  # @param schema [String] schema source text sent with every request
  # @param type_name [String] name of the type inside the schema to validate against
  def initialize(schema:, type_name:)
    @schema = schema
    @type_name = type_name
  end

  # Sends one parsed JSON document to the service.
  #
  # @param input [Object] parsed JSON data to validate
  # @return [Object] raw HTTP response (responds to #status and #body)
  def run(input)
    request_body = {
      schema: @schema,
      typeName: @type_name,
      jsonData: input
    }

    Application.connection.post('/validate', request_body)
  end

  # Reads a single JSON file and validates it.
  #
  # @param file [String, #to_path] path of the JSON file
  # @return [ValidateResponse] wrapped result, tagged with +file+
  def run_file(file)
    input = JSON.parse(File.read(File.expand_path(file)))
    # Pass the file along so ValidateResponse#to_s can report it.
    ValidateResponse.new(run(input), file: file)
  end

  # Validates every `*.target.json` file in +dir+ concurrently.
  #
  # When a block is given it is called with `(response, progressbar)` for each
  # file; otherwise failed validations are logged through the progress bar.
  #
  # @param dir [String, #to_path] directory to scan
  # @param limit [Integer, nil] maximum number of files to read; nil reads all
  def run_files(dir, limit: nil, &block)
    files = Dir[File.join(File.expand_path(dir), '*.target.json')]
    files = files.first(limit) if limit

    progressbar = TTY::ProgressBar.new("Validate: [:bar] :percent :current/:total", total: files.size)

    Async do
      files.each do |file|
        Async do
          input = JSON.parse(File.read(file))
          response = ValidateResponse.new(run(input), file: file)

          if block
            block.call(response, progressbar)
          elsif !response.success?
            log_failure(response, progressbar)
          end
          progressbar.advance
        end
      end
    end.wait
  end

  # Wraps the HTTP response of a validation request.
  class ValidateResponse
    # @return [Object] raw HTTP response
    # @return [Hash] decoded response body
    # @return [String, #to_s, nil] file the response belongs to, if known
    attr_reader :response, :body, :file

    # @param response [#status, #body] raw HTTP response
    # @param file [String, #to_s, nil] file that was validated, if any
    def initialize(response, file: nil)
      @response = response
      @body = decode_body(response.body)
      @file = file
    end

    # @return [Boolean, nil] truthy when the status is 200 and the body reports success
    def success?
      @response.status == 200 && @body["success"]
    end

    # @return [String, nil] message reported by the service
    def message
      @body["message"]
    end

    # @return [Object, nil] data reported by the service
    def data
      @body["data"]
    end

    # Human-readable, colorized summary: the success flag, the file (when
    # known) and, for failures, the message.
    #
    # @return [String]
    def to_s
      verdict = success? ? 'true'.colorize(:green) : 'false'.colorize(:red)
      parts = ["#{'success:'.colorize(:bold_blue)} #{verdict}"]
      # file is nil for responses built without one; skip the line instead of crashing
      parts << "#{'file:'.colorize(:bold_blue)} #{file.to_s.colorize(:yellow)}" if file
      parts << "#{'message:'.colorize(:bold_blue)} #{message.to_s.colorize(:yellow)}" unless success?
      "#{parts.join("\n")}\n\n"
    end

    private

    # Decodes the response body. A body that is not a JSON object (for example
    # an HTML error page) becomes a failed result carrying the raw text as its
    # message, so one bad response does not abort the whole run.
    def decode_body(raw)
      parsed = JSON.parse(raw.to_s)
      parsed.is_a?(Hash) ? parsed : { "success" => false, "message" => raw.to_s }
    rescue JSON::ParserError
      { "success" => false, "message" => raw.to_s }
    end
  end

  # Backward-compatible alias for the original (misspelled) class name.
  ValidateReponse = ValidateResponse

  private

  # Logs a failed validation line by line through the progress bar.
  def log_failure(response, progressbar)
    progressbar.log "validate failed".colorize(:red)
    response.to_s.split("\n").each do |line|
      progressbar.log line
    end
    progressbar.log "\n"
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kaba
4
+ VERSION = "0.1.0"
5
+ end
data/lib/kaba.rb ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "kaba/version"
4
+ require_relative "kaba/dataset"
5
+ require_relative "kaba/dataset_source"
6
+ require_relative "kaba/prompt"
7
+ require_relative "kaba/validate"
8
+
9
+ module Kaba
10
+ class Error < StandardError; end
11
+ # Your code goes here...
12
+ end
data/sig/kaba.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Kaba
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kaba
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - MJ
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-11-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: async
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.20'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.20'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.12'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: async-http-faraday
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.19.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.19.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: colorize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: tty-progressbar
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.18.3
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.18.3
83
+ description: 用来做数据集的工具
84
+ email:
85
+ - tywf91@gmail.com
86
+ executables:
87
+ - kaba
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - README.md
92
+ - Rakefile
93
+ - exe/kaba
94
+ - kaba.gemspec
95
+ - lib/kaba.rb
96
+ - lib/kaba/_DPodfile_
97
+ - lib/kaba/dataset.rb
98
+ - lib/kaba/dataset_source.rb
99
+ - lib/kaba/prompt.rb
100
+ - lib/kaba/validate.rb
101
+ - lib/kaba/version.rb
102
+ - sig/kaba.rbs
103
+ homepage: https://github.com/mjason/kaba.git
104
+ licenses: []
105
+ metadata:
106
+ homepage_uri: https://github.com/mjason/kaba.git
107
+ source_code_uri: https://github.com/mjason/kaba.git
108
+ changelog_uri: https://github.com/mjason/kaba.git
109
+ post_install_message:
110
+ rdoc_options: []
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: 3.3.0
118
+ required_rubygems_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ requirements: []
124
+ rubygems_version: 3.5.3
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: 用来做数据集的工具
128
+ test_files: []