red-datasets 0.0.8 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,95 @@
1
require "csv"

require_relative "dataset"
require_relative "tar-gz-readable"
3
+
4
+ module Datasets
5
+ class RdatasetsList < Dataset
6
+ Record = Struct.new(:package,
7
+ :dataset,
8
+ :title,
9
+ :rows,
10
+ :cols,
11
+ :n_binary,
12
+ :n_character,
13
+ :n_factor,
14
+ :n_logical,
15
+ :n_numeric,
16
+ :csv,
17
+ :doc)
18
+
19
+ def initialize
20
+ super
21
+ @metadata.id = "rdatasets"
22
+ @metadata.name = "Rdatasets"
23
+ @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
24
+ @metadata.licenses = ["GPL-3"]
25
+ @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
26
+ @data_path = cache_dir_path + "datasets.csv"
27
+ end
28
+
29
+ def filter(package: nil, dataset: nil)
30
+ return to_enum(__method__, package: package, dataset: dataset) unless block_given?
31
+
32
+ conds = {}
33
+ conds["Package"] = package if package
34
+ conds["Item"] = dataset if dataset
35
+ if conds.empty?
36
+ each_row {|row| yield Record.new(*row.fields) }
37
+ else
38
+ each_row do |row|
39
+ if conds.all? {|k, v| row[k] == v }
40
+ yield Record.new(*row.fields)
41
+ end
42
+ end
43
+ end
44
+ end
45
+
46
+ def each(&block)
47
+ filter(&block)
48
+ end
49
+
50
+ private def each_row(&block)
51
+ download(@data_path, @data_url) unless @data_path.exist?
52
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
53
+ csv.each(&block)
54
+ end
55
+ end
56
+ end
57
+
58
+ class Rdatasets < Dataset
59
+ def initialize(package_name, dataset_name)
60
+ list = RdatasetsList.new
61
+
62
+ info = list.filter(package: package_name, dataset: dataset_name).first
63
+ unless info
64
+ raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
65
+ end
66
+
67
+ super()
68
+ @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
69
+ @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
70
+ @metadata.url = info.csv
71
+ @metadata.licenses = ["GPL-3"]
72
+ @metadata.description = info.title
73
+
74
+ # Follow the original directory structure in the cache directory
75
+ @data_path = cache_dir_path + (dataset_name + ".csv")
76
+
77
+ @package_name = package_name
78
+ @dataset_name = dataset_name
79
+ end
80
+
81
+ def each(&block)
82
+ return to_enum(__method__) unless block_given?
83
+
84
+ download(@data_path, @metadata.url) unless @data_path.exist?
85
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
86
+ csv.each do |row|
87
+ record = row.to_h
88
+ record.delete("")
89
+ record.transform_keys!(&:to_sym)
90
+ yield record
91
+ end
92
+ end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,49 @@
1
require "csv"

module Datasets
  # Datasets from the seaborn-data repository
  # (https://github.com/mwaskom/seaborn-data), applying the same
  # per-dataset preprocessing as seaborn's load_dataset function.
  class SeabornData < Dataset
    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze

    # name - dataset file name without the ".csv" suffix (e.g. "flights").
    def initialize(name)
      super()
      @metadata.id = "seaborn-data-#{name}"
      @metadata.name = "SeabornData: #{name}"
      @metadata.url = URL_FORMAT % {name: name}

      @data_path = cache_dir_path + (name + ".csv")
      @name = name
    end

    # Yields one Hash (symbol keys) per CSV row, downloading on demand.
    # Returns an Enumerator when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      download(@data_path, @metadata.url) unless @data_path.exist?
      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
        csv.each do |row|
          record = prepare_record(row)
          yield record
        end
      end
    end

    private
    # Converts a CSV::Row to a symbol-keyed Hash and applies the
    # dataset-specific preprocessor when one is defined below.
    def prepare_record(csv_row)
      record = csv_row.to_h
      record.transform_keys!(&:to_sym)

      # Perform the same preprocessing as seaborn's load_dataset function
      preprocessor = :"preprocess_#{@name}_record"
      __send__(preprocessor, record) if respond_to?(preprocessor, true)

      record
    end

    # The same preprocessing as seaborn.load_dataset
    def preprocess_flights_record(record)
      record[:month] &&= record[:month][0,3]
    end

    # The same preprocessing as seaborn.load_dataset
    def preprocess_penguins_record(record)
      record[:sex] &&= record[:sex].capitalize
    end
  end
end
@@ -0,0 +1,169 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class SudachiSynonymDictionary < Dataset
7
+ class Synonym < Struct.new(:group_id,
8
+ :is_noun,
9
+ :expansion_type,
10
+ :lexeme_id,
11
+ :form_type,
12
+ :acronym_type,
13
+ :variant_type,
14
+ :categories,
15
+ :notation)
16
+ alias_method :noun?, :is_noun
17
+ end
18
+
19
+ def initialize
20
+ super()
21
+ @metadata.id = "sudachi-synonym-dictionary"
22
+ @metadata.name = "Sudachi synonym dictionary"
23
+ @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
24
+ @metadata.licenses = [
25
+ "Apache-2.0",
26
+ ]
27
+ @metadata.description = lambda do
28
+ download_description
29
+ end
30
+ end
31
+
32
+ def each
33
+ return to_enum(__method__) unless block_given?
34
+
35
+ lexeme_id_context = {}
36
+ open_data do |csv|
37
+ csv.each do |row|
38
+ group_id = row[0]
39
+ if group_id != lexeme_id_context[:group_id]
40
+ lexeme_id_context[:group_id] = group_id
41
+ lexeme_id_context[:counter] = 0
42
+ end
43
+ is_noun = (row[1] == "1")
44
+ expansion_type = normalize_expansion_type(row[2])
45
+ lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
46
+ form_type = normalize_form_type(row[4])
47
+ acronym_type = normalize_acronym_type(row[5])
48
+ variant_type = normalize_variant_type(row[6])
49
+ categories = normalize_categories(row[7])
50
+ notation = row[8]
51
+ synonym = Synonym.new(group_id,
52
+ is_noun,
53
+ expansion_type,
54
+ lexeme_id,
55
+ form_type,
56
+ acronym_type,
57
+ variant_type,
58
+ categories,
59
+ notation)
60
+ yield(synonym)
61
+ end
62
+ end
63
+ end
64
+
65
+ private
66
+ def open_data
67
+ data_path = cache_dir_path + "synonyms.txt"
68
+ unless data_path.exist?
69
+ data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
70
+ download(data_path, data_url)
71
+ end
72
+ CSV.open(data_path, skip_blanks: true) do |csv|
73
+ yield(csv)
74
+ end
75
+ end
76
+
77
+ def download_description
78
+ description_path = cache_dir_path + "synonyms.md"
79
+ unless description_path.exist?
80
+ description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
81
+ download(description_path, description_url)
82
+ end
83
+ description_path.read
84
+ end
85
+
86
+ def normalize_expansion_type(type)
87
+ case type
88
+ when "0", ""
89
+ :always
90
+ when "1"
91
+ :expanded
92
+ when "2"
93
+ :never
94
+ else
95
+ raise Error, "unknown expansion type: #{type.inspect}"
96
+ end
97
+ end
98
+
99
+ def normalize_lexeme_id(id, context)
100
+ case id
101
+ when ""
102
+ lexeme_id_context[:counter] += 1
103
+ lexeme_id_context[:counter]
104
+ else
105
+ # Use only the first lexeme ID.
106
+ # Example:
107
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
108
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
109
+ Integer(id.split("/").first, 10)
110
+ end
111
+ end
112
+
113
+ def normalize_form_type(type)
114
+ case type
115
+ when "0", ""
116
+ :typical
117
+ when "1"
118
+ :translation
119
+ when "2"
120
+ :alias
121
+ when "3"
122
+ :old_name
123
+ when "4"
124
+ :misnomer
125
+ else
126
+ raise Error, "unknown form type: #{type.inspect}"
127
+ end
128
+ end
129
+
130
+ def normalize_acronym_type(type)
131
+ case type
132
+ when "0", ""
133
+ :typical
134
+ when "1"
135
+ :alphabet
136
+ when "2"
137
+ :others
138
+ else
139
+ raise Error, "unknown acronym type: #{type.inspect}"
140
+ end
141
+ end
142
+
143
+ def normalize_variant_type(type)
144
+ case type
145
+ when "0", ""
146
+ :typical
147
+ when "1"
148
+ :alphabet
149
+ when "2"
150
+ :general
151
+ when "3"
152
+ :misspelled
153
+ else
154
+ raise Error, "unknown variant type: #{type.inspect}"
155
+ end
156
+ end
157
+
158
+ def normalize_categories(categories)
159
+ case categories
160
+ when ""
161
+ nil
162
+ when /\A\((.*)\)\z/
163
+ $1.split("/")
164
+ else
165
+ raise Error, "invalid categories: #{categories.inspect}"
166
+ end
167
+ end
168
+ end
169
+ end
@@ -2,19 +2,99 @@ require "datasets/dictionary"
2
2
 
3
3
  module Datasets
4
4
  class Table
5
+ class Record
6
+ include Enumerable
7
+
8
+ def initialize(table, index)
9
+ @table = table
10
+ @index = index
11
+ end
12
+
13
+ def [](column_name_or_column_index)
14
+ @table[column_name_or_column_index][@index]
15
+ end
16
+
17
+ def each
18
+ return to_enum(__method__) unless block_given?
19
+ @table.each_column.each do |column_name, column_values|
20
+ yield(column_name, column_values[@index])
21
+ end
22
+ end
23
+
24
+ def values
25
+ @table.each_column.collect do |_column_name, column_values|
26
+ column_values[@index]
27
+ end
28
+ end
29
+
30
+ def to_h
31
+ hash = {}
32
+ each do |column_name, column_value|
33
+ hash[column_name] = column_value
34
+ end
35
+ hash
36
+ end
37
+
38
+ def inspect
39
+ "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
40
+ end
41
+ end
42
+
5
43
  include Enumerable
6
44
 
45
+ attr_reader :dataset
7
46
  def initialize(dataset)
8
47
  @dataset = dataset
9
48
  @dictionaries = {}
10
49
  end
11
50
 
12
- def each(&block)
51
+ def n_columns
52
+ columner_data.size
53
+ end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
+ def n_rows
58
+ first_column = columner_data.first
59
+ return 0 if first_column.nil?
60
+ first_column[1].size
61
+ end
62
+
63
+ def column_names
64
+ columner_data.keys
65
+ end
66
+
67
+ def each_column(&block)
13
68
  columner_data.each(&block)
14
69
  end
70
+ alias_method :each, :each_column
15
71
 
16
- def [](name)
17
- columner_data[normalize_name(name)]
72
+ def each_record
73
+ return to_enum(__method__) unless block_given?
74
+ n_rows.times do |i|
75
+ yield(Record.new(self, i))
76
+ end
77
+ end
78
+
79
+ def find_record(row)
80
+ row += n_rows if row < 0
81
+ return nil if row < 0
82
+ return nil if row >= n_rows
83
+ Record.new(self, row)
84
+ end
85
+
86
+ def [](name_or_index)
87
+ case name_or_index
88
+ when Integer
89
+ index = name_or_index
90
+ columner_data.each_with_index do |(_name, values), i|
91
+ return values if i == index
92
+ end
93
+ nil
94
+ else
95
+ name = name_or_index
96
+ columner_data[normalize_name(name)]
97
+ end
18
98
  end
19
99
 
20
100
  def dictionary_encode(name)
@@ -0,0 +1,14 @@
1
require "rubygems/package"
require "zlib"

module Datasets
  # Mix-in for datasets distributed as .tar.gz archives: yields a
  # Gem::Package::TarReader over the decompressed stream, closing
  # everything when the block returns.
  module TarGzReadable
    def open_tar_gz(data_path)
      Zlib::GzipReader.open(data_path) do |gzip|
        Gem::Package::TarReader.new(gzip) do |tar|
          yield(tar)
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
module Datasets
  # Gem version (semantic versioning).
  VERSION = "0.1.3"
end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path