kaba 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7d0d15c1454667339192ecb68a40a0128221f0e2f176c9668fdddbaf6507a143
4
+ data.tar.gz: 63d4b6088c24453874a2acf9a1c183b6d7468a0297c43cf32feccd0e838f84c0
5
+ SHA512:
6
+ metadata.gz: 0e937f2aea17053fd01b8a4b5f76f61d58d2817eed4e12942f4edaee0ef72b8cf2fc2c3febeb082d49131fad6aba495bfc7385d59bd13c127b8faaa0ad1a7719
7
+ data.tar.gz: 592a977e7ec2c0b80f76e9f4c9fa0875357466ab1e6cbb05e9dec0fdabe086d8f25fda39caad60d0334ed145a69483ef6ea091d25bc5c39299c9afc5380d6155
data/README.md ADDED
@@ -0,0 +1,9 @@
1
+ # Kaba
2
+
3
+ ## 目录结构
4
+ - data
5
+ - row
6
+ - schema
7
+
8
+ ## 命令
9
+ `gem install kaba`
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/exe/kaba ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+ require "bundler/setup"
3
+
4
+ require 'async'
5
+ require 'faraday'
6
+ require 'colorize'
7
+ require 'tty-progressbar'
8
+ require 'async/http/faraday'
9
+
10
+ require 'json'
11
+ require "kaba"
12
+
13
+ class Application
14
+ class << self
15
+ def connection
16
+ @connection ||= Faraday.new('https://lisa-typechat.listenai.com') do |faraday|
17
+ faraday.adapter :async_http, clients: Async::HTTP::Faraday::PersistentClients
18
+ faraday.request :json
19
+ end
20
+ end
21
+ end
22
+ end
23
+
24
+ def ddbug
25
+ require 'irb'
26
+ binding.irb
27
+ end
28
+
29
+ # 运行 DPodfile 文件,DPodfile 是一个 Ruby 文件
30
+ load DatasetSource.podfile
data/kaba.gemspec ADDED
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/kaba/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "kaba"
7
+ spec.version = Kaba::VERSION
8
+ spec.authors = ["MJ"]
9
+ spec.email = ["tywf91@gmail.com"]
10
+
11
+ spec.summary = "用来做数据集的工具"
12
+ spec.description = "用来做数据集的工具"
13
+ spec.homepage = "https://github.com/mjason/kaba.git"
14
+ spec.required_ruby_version = ">= 3.3.0"
15
+
16
+ # spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
17
+
18
+ spec.metadata["homepage_uri"] = spec.homepage
19
+ spec.metadata["source_code_uri"] = "https://github.com/mjason/kaba.git"
20
+ spec.metadata["changelog_uri"] = "https://github.com/mjason/kaba.git"
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(__dir__) do
25
+ `git ls-files -z`.split("\x0").reject do |f|
26
+ (File.expand_path(f) == __FILE__) ||
27
+ f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
28
+ end
29
+ end
30
+ spec.bindir = "exe"
31
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
32
+ spec.require_paths = ["lib"]
33
+
34
+ spec.add_dependency "async", "~> 2.20"
35
+ spec.add_dependency "faraday", "~> 2.12"
36
+ spec.add_dependency "async-http-faraday", "~> 0.19.0"
37
+ spec.add_dependency "colorize", "~> 1.1"
38
+ spec.add_dependency "tty-progressbar", "~> 0.18.3"
39
+
40
+ # For more information and examples about making a new gem, check out our
41
+ # guide at: https://bundler.io/guides/creating_gem.html
42
+ end
@@ -0,0 +1,51 @@
1
+ ## 使用 Ruby 语言编写的数据集校验脚本
2
+ # ddbug 断点调试
3
+ # 使用 colorize 来输出带颜色的信息,https://github.com/fazibear/colorize
4
+ # 使用 progressbar 来显示进度条,https://github.com/piotrmurach/tty-progressbar
5
+ # 设置数据集目录, 如果使用 Docker 方式运行,需要将数据集挂载到 /data 目录下,DatasetSource 会自动加载 /data 目录下的数据集
6
+ source = DatasetSource.new(File.join(__dir__, 'data'))
7
+ schema = source.schema.join('resume.ts').read
8
+ type_name = 'Resume'
9
+ prompt = Prompt.new(schema, type_name)
10
+ validate = Validate.new(schema: schema, type_name: type_name)
11
+ dataset = Dataset.new(source.row, prompt)
12
+
13
+ # #
14
+ # # 可以使用 run_file 来验证单个文件
15
+ # response = validate.run_file(source.row.join('1.target.json'))
16
+ # puts response
17
+ # #
18
+
19
+ ## 一般来说直接 run_files 就可以了,支持 limit 来限制读取的文件数量
20
+ validate.run_files(source.row)
21
+
22
+ ## 数据集处理部分
23
+ # 支持 limit 来限制读取的文件数量
24
+ dataset.scan()
25
+ dataset.save(source.join('train.jsonl'))
26
+
27
+ ## 高级玩法,不要轻易尝试
28
+ #
29
+ ### 可以加入 limit 来限制读取的文件数量,validate.run_files('./data/row', limit: 1) do |response, progressbar|
30
+ # validate.run_files(source.row) do |response, progressbar|
31
+ # progressbar.log response.to_s unless response.success?
32
+ # end
33
+ #
34
+ #
35
+ ## 也支持 limit 来进行限制
36
+ # Async do
37
+ # dataset.each do |row, ds|
38
+ # Async do
39
+ # instruction = prompt.render(File.read row.input_file)
40
+ # target = <<~Markdown
41
+ # ```json
42
+ # #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
43
+ # ```
44
+ # Markdown
45
+ # ds.add({ instruction: instruction, output: target })
46
+
47
+ # instruction = prompt.render(File.read(row.input_file), export: true)
48
+ # ds.add({ instruction: instruction, output: target })
49
+ # end
50
+ # end
51
+ # end.wait
@@ -0,0 +1,68 @@
1
# Builds training rows from the `*.target.json` files found directly inside
# one directory and writes them out as JSON Lines.
#
# For every target file a sibling `*.input.txt` file is expected (see Row).
class Dataset
  # @return [Array] rows collected so far through {#add}
  attr_reader :lines

  # @param data_dir [String, #to_path] directory searched (non-recursively)
  #   for `*.target.json` files
  # @param prompt [#render] object whose `render(input, export:)` produces the
  #   instruction text for a row
  def initialize(data_dir, prompt)
    pattern = File.join(File.expand_path(data_dir), '*.target.json')
    @data_files = Dir.glob(pattern)
    @lines = []
    @prompt = prompt
  end

  # Yields a Row for each discovered target file together with this dataset,
  # so the block can call {#add} on it.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil visits all
  # @yieldparam row [Row]
  # @yieldparam dataset [Dataset] self
  def _each(limit: nil)
    selected_files(limit).each { |file| yield(Row.new(file), self) }
  end

  # Same as {#_each}, but prints a red warning first because the caller takes
  # over row generation completely.
  #
  # @param limit [Integer, nil] maximum number of files to visit; nil visits all
  def each(limit: nil, &block)
    puts "Warning: each is very dangerous".colorize(:red)
    _each(limit: limit, &block)
  end

  # Appends one row to the dataset.
  #
  # @param data [#to_json] row to store
  # @return [Array] the updated list of rows
  def add(data)
    @lines << data
  end

  # Writes every collected row as one JSON document per line.
  #
  # @param file_path [String, #to_path] destination file (overwritten)
  def save(file_path)
    File.open(File.expand_path(file_path), 'w') do |file|
      @lines.each { |line| file.puts(line.to_json) }
    end
  end

  # Checks that exactly two rows exist per discovered target file.
  #
  # The comparison uses ALL discovered files, so it is false after a scan that
  # was restricted with `limit:`.
  #
  # @return [Boolean]
  def validate
    @lines.size == @data_files.size * 2
  end

  # Generates two rows per target file: one rendered with the schema as-is and
  # one rendered with `export: true`. Both rows share the same output, the
  # pretty-printed target JSON wrapped in a fenced `json` code block.
  #
  # @param limit [Integer, nil] maximum number of files to process; nil processes all
  def scan(limit: nil)
    # Size the bar from the files that will actually be processed; sizing it
    # from every discovered file left a limited scan stuck below 100%.
    total = selected_files(limit).size
    progressbar = TTY::ProgressBar.new("Dataset: [:bar] :percent :current/:total", total: total)

    Async do
      _each(limit: limit) do |row, ds|
        Async do
          input = File.read(row.input_file) # read once, rendered twice below
          target = <<~Markdown
            ```json
            #{JSON.pretty_generate(JSON.parse(File.read(row.target_path)))}
            ```
          Markdown

          ds.add({ instruction: @prompt.render(input), output: target })
          ds.add({ instruction: @prompt.render(input, export: true), output: target })

          progressbar.advance
        end
      end
    end.wait
  end

  private

  # @param limit [Integer, nil]
  # @return [Array<String>] the first `limit` target files, or all when nil
  def selected_files(limit)
    @data_files.first(limit || @data_files.size)
  end
end

# Pairs a `*.target.json` file with the `*.input.txt` file of the same stem.
class Row
  # @return [String] absolute path of the target file / derived input file
  attr_reader :target_path, :input_file

  # @param file [String, #to_path] path of a `*.target.json` file
  def initialize(file)
    @target_path = File.expand_path(file)
    # \z anchors at the very end of the string; `$` would also match before a
    # trailing newline.
    @input_file = @target_path.sub(/\.target\.json\z/, '.input.txt')
  end
end
@@ -0,0 +1,41 @@
1
# FileUtils.cp is used by DatasetSource.podfile; require it here so this file
# does not depend on some other library having loaded it first.
require 'fileutils'

# Small path wrapper used to address the pieces of a dataset directory.
#
# Instances respond to `to_path`, so they can be handed directly to `File`
# and `Dir` methods that accept path-like objects.
class DatasetSource
  # @return [String] the wrapped path, exactly as given to the constructor
  attr_reader :path

  # @param path [String] file or directory path to wrap
  def initialize(path)
    @path = path
  end

  # Defines `row` and `schema`, each returning a new DatasetSource for the
  # sub-directory of the same name.
  %i[row schema].each do |method_name|
    define_method(method_name) { join(method_name.to_s) }
  end

  # @return [String] contents of the file at {#path}
  def read
    File.read(@path)
  end

  # @return [String] the wrapped path
  def to_s
    @path
  end

  # @return [String] the wrapped path (implicit conversion hook for File/Dir)
  def to_path
    @path
  end

  # @param name [String] path component to append
  # @return [DatasetSource] a new instance for `path/name`
  def join(name)
    self.class.new(File.join(@path, name))
  end

  class << self
    # Returns the path of the `DPodfile` in the current working directory,
    # first creating it from the `_DPodfile_` template stored next to this
    # source file when it does not exist yet.
    #
    # @return [String] absolute path of the DPodfile
    def podfile
      d_podfile_path = File.join(Dir.pwd, 'DPodfile')
      FileUtils.cp(File.join(__dir__, '_DPodfile_'), d_podfile_path) unless File.exist?(d_podfile_path)
      d_podfile_path
    end
  end
end
41
+
@@ -0,0 +1,26 @@
1
# Renders instruction prompts by posting a schema, a type name and an input
# text to the `/prompt` endpoint of `Application.connection`.
class Prompt
  # @param schema [String] schema source text
  # @param type_name [String] name of the type inside the schema, sent as `typeName`
  def initialize(schema, type_name)
    @schema = schema
    @type_name = type_name
  end

  # @return [String] the schema with every `export default ` and `export `
  #   keyword prefix removed
  def clear_export
    @schema.gsub(/export\s+default\s+/, '').gsub(/export\s+/, '')
  end

  # Requests a rendered prompt for one input text.
  #
  # @param input [String] the text the prompt is built around
  # @param export [Boolean] when true, the schema is sent with its `export`
  #   keywords stripped (see #clear_export); otherwise it is sent unchanged
  # @return [Object] the `body` of the response returned by the connection
  def render(input, export: false)
    request_body = {
      schema: export ? clear_export : @schema,
      typeName: @type_name,
      # NOTE(review): this key used to be spelled `inpu`, which looks like a
      # typo that would drop the input text from the request. Confirm that the
      # /prompt endpoint expects `input`.
      input: input
    }
    Application.connection.post('/prompt', request_body).body
  end

  class << self
    # Builds a Prompt from a schema stored on disk.
    #
    # @param schema_path [String] path of the schema file
    # @param type_name [String] name of the type inside the schema
    # @return [Prompt]
    def file(schema_path, type_name)
      new(File.read(File.expand_path(schema_path)), type_name)
    end
  end
end
@@ -0,0 +1,84 @@
1
# Validates dataset target documents against a schema by posting them to the
# `/validate` endpoint of `Application.connection`.
class Validate
  # @param schema [String] schema source text, sent as `schema`
  # @param type_name [String] name of the type to validate against, sent as `typeName`
  def initialize(schema:, type_name:)
    @schema = schema
    @type_name = type_name
  end

  # Posts one already-parsed JSON document for validation.
  #
  # @param input [Object] parsed JSON data, sent as `jsonData`
  # @return [Object] whatever `Application.connection.post` returns
  def run(input)
    request_body = { schema: @schema, typeName: @type_name, jsonData: input }
    Application.connection.post('/validate', request_body)
  end

  # Reads a single JSON file and validates it.
  #
  # @param file [String, #to_path] path of the JSON file
  # @return [ValidateReponse] the wrapped result; its `file` is the expanded path
  # @raise [JSON::ParserError] when the file does not contain valid JSON
  def run_file(file)
    path = File.expand_path(file)
    ValidateReponse.new(run(JSON.parse(File.read(path))), file: path)
  end

  # Validates every `*.target.json` file directly inside `dir`.
  #
  # With a block, each result is yielded as `(response, progressbar)`.
  # Without a block, failed results are logged through the progress bar.
  # A target file that is not valid JSON is logged as a failure and skipped;
  # it is not yielded because there is no response for it.
  #
  # @param dir [String, #to_path] directory to search
  # @param limit [Integer, nil] maximum number of files to read; nil reads all
  def run_files(dir, limit: nil, &block)
    files = Dir[File.join(File.expand_path(dir), '*.target.json')]
    files = files.first(limit) if limit

    progressbar = TTY::ProgressBar.new("Validate: [:bar] :percent :current/:total", total: files.size)

    Async do
      files.each do |file|
        Async do
          validate_one(file, progressbar, &block)
        ensure
          # Advance even when this file failed so the bar can still complete.
          progressbar.advance
        end
      end
    end.wait
  end

  # Result of one validation request: the raw response plus its decoded body.
  class ValidateReponse
    # @return [Object] raw response, decoded body Hash, and validated file (may be nil)
    attr_reader :response, :body, :file

    # @param response [#status, #body] response returned by the connection
    # @param file [String, nil] path of the validated file, if known
    def initialize(response, file: nil)
      @response = response
      @body = decode(response.body)
      @file = file
    end

    # @return [Object] truthy when the status is 200 and the body's `success` is truthy
    def success?
      @response.status == 200 && @body["success"]
    end

    # @return [Object, nil] the body's `message` entry
    def message
      @body["message"]
    end

    # @return [Object, nil] the body's `data` entry
    def data
      @body["data"]
    end

    # Colored, multi-line summary. The file line is omitted when no file is
    # known, and the message line is only shown for failures.
    #
    # @return [String]
    def to_s
      verdict = success? ? 'true'.colorize(:green) : 'false'.colorize(:red)
      parts = ["#{'success:'.colorize(:bold_blue)} #{verdict}"]
      # `file` is nil for responses built without one; guard so nil.colorize is never called.
      parts << "#{'file:'.colorize(:bold_blue)} #{file.to_s.colorize(:yellow)}" if file
      parts << "#{'message:'.colorize(:bold_blue)} #{message.to_s.colorize(:yellow)}" unless success?
      "#{parts.join("\n")}\n\n"
    end

    private

    # Decodes the response body. Anything that is not a JSON object (for
    # example an HTML error page) becomes a failure body instead of raising.
    def decode(raw)
      parsed = begin
        JSON.parse(raw)
      rescue JSON::ParserError, TypeError
        nil
      end
      return parsed if parsed.is_a?(Hash)

      { "success" => false, "message" => "unexpected response body: #{raw.inspect}" }
    end
  end

  # Correctly spelled alias; the original constant name is kept for existing callers.
  ValidateResponse = ValidateReponse

  private

  # Validates one target file and reports the outcome.
  def validate_one(file, progressbar, &block)
    begin
      input = JSON.parse(File.read(file))
    rescue JSON::ParserError => e
      progressbar.log "validate failed".colorize(:red)
      progressbar.log "#{file}: #{e.message}"
      return
    end

    response = ValidateReponse.new(run(input), file: file)
    if block
      block.call(response, progressbar)
    elsif !response.success?
      log_failure(response, progressbar)
    end
  end

  # Logs a failed response line by line so the progress bar stays intact.
  def log_failure(response, progressbar)
    progressbar.log "validate failed".colorize(:red)
    response.to_s.split("\n").each { |line| progressbar.log line }
    progressbar.log "\n"
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kaba
4
+ VERSION = "0.1.0"
5
+ end
data/lib/kaba.rb ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "kaba/version"
4
+ require_relative "kaba/dataset"
5
+ require_relative "kaba/dataset_source"
6
+ require_relative "kaba/prompt"
7
+ require_relative "kaba/validate"
8
+
9
+ module Kaba
10
+ class Error < StandardError; end
11
+ # Your code goes here...
12
+ end
data/sig/kaba.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Kaba
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kaba
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - MJ
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-11-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: async
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.20'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.20'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.12'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: async-http-faraday
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.19.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.19.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: colorize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: tty-progressbar
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.18.3
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.18.3
83
+ description: 用来做数据集的工具
84
+ email:
85
+ - tywf91@gmail.com
86
+ executables:
87
+ - kaba
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - README.md
92
+ - Rakefile
93
+ - exe/kaba
94
+ - kaba.gemspec
95
+ - lib/kaba.rb
96
+ - lib/kaba/_DPodfile_
97
+ - lib/kaba/dataset.rb
98
+ - lib/kaba/dataset_source.rb
99
+ - lib/kaba/prompt.rb
100
+ - lib/kaba/validate.rb
101
+ - lib/kaba/version.rb
102
+ - sig/kaba.rbs
103
+ homepage: https://github.com/mjason/kaba.git
104
+ licenses: []
105
+ metadata:
106
+ homepage_uri: https://github.com/mjason/kaba.git
107
+ source_code_uri: https://github.com/mjason/kaba.git
108
+ changelog_uri: https://github.com/mjason/kaba.git
109
+ post_install_message:
110
+ rdoc_options: []
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: 3.3.0
118
+ required_rubygems_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ requirements: []
124
+ rubygems_version: 3.5.3
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: 用来做数据集的工具
128
+ test_files: []