red-datasets 0.0.8 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,95 @@
1
+ require_relative "dataset"
2
+ require_relative "tar-gz-readable"
3
+
4
+ module Datasets
5
+ class RdatasetsList < Dataset
6
+ Record = Struct.new(:package,
7
+ :dataset,
8
+ :title,
9
+ :rows,
10
+ :cols,
11
+ :n_binary,
12
+ :n_character,
13
+ :n_factor,
14
+ :n_logical,
15
+ :n_numeric,
16
+ :csv,
17
+ :doc)
18
+
19
+ def initialize
20
+ super
21
+ @metadata.id = "rdatasets"
22
+ @metadata.name = "Rdatasets"
23
+ @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
24
+ @metadata.licenses = ["GPL-3"]
25
+ @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
26
+ @data_path = cache_dir_path + "datasets.csv"
27
+ end
28
+
29
+ def filter(package: nil, dataset: nil)
30
+ return to_enum(__method__, package: package, dataset: dataset) unless block_given?
31
+
32
+ conds = {}
33
+ conds["Package"] = package if package
34
+ conds["Item"] = dataset if dataset
35
+ if conds.empty?
36
+ each_row {|row| yield Record.new(*row.fields) }
37
+ else
38
+ each_row do |row|
39
+ if conds.all? {|k, v| row[k] == v }
40
+ yield Record.new(*row.fields)
41
+ end
42
+ end
43
+ end
44
+ end
45
+
46
+ def each(&block)
47
+ filter(&block)
48
+ end
49
+
50
+ private def each_row(&block)
51
+ download(@data_path, @data_url) unless @data_path.exist?
52
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
53
+ csv.each(&block)
54
+ end
55
+ end
56
+ end
57
+
58
+ class Rdatasets < Dataset
59
+ def initialize(package_name, dataset_name)
60
+ list = RdatasetsList.new
61
+
62
+ info = list.filter(package: package_name, dataset: dataset_name).first
63
+ unless info
64
+ raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
65
+ end
66
+
67
+ super()
68
+ @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
69
+ @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
70
+ @metadata.url = info.csv
71
+ @metadata.licenses = ["GPL-3"]
72
+ @metadata.description = info.title
73
+
74
+ # Follow the original directory structure in the cache directory
75
+ @data_path = cache_dir_path + (dataset_name + ".csv")
76
+
77
+ @package_name = package_name
78
+ @dataset_name = dataset_name
79
+ end
80
+
81
+ def each(&block)
82
+ return to_enum(__method__) unless block_given?
83
+
84
+ download(@data_path, @metadata.url) unless @data_path.exist?
85
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
86
+ csv.each do |row|
87
+ record = row.to_h
88
+ record.delete("")
89
+ record.transform_keys!(&:to_sym)
90
+ yield record
91
+ end
92
+ end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,49 @@
1
+ module Datasets
2
+ class SeabornData < Dataset
3
+ URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
+
5
+ def initialize(name)
6
+ super()
7
+ @metadata.id = "seaborn-data-#{name}"
8
+ @metadata.name = "SeabornData: #{name}"
9
+ @metadata.url = URL_FORMAT % {name: name}
10
+
11
+ @data_path = cache_dir_path + (name + ".csv")
12
+ @name = name
13
+ end
14
+
15
+ def each(&block)
16
+ return to_enum(__method__) unless block_given?
17
+
18
+ download(@data_path, @metadata.url) unless @data_path.exist?
19
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
+ csv.each do |row|
21
+ record = prepare_record(row)
22
+ yield record
23
+ end
24
+ end
25
+ end
26
+
27
+ private
28
+ def prepare_record(csv_row)
29
+ record = csv_row.to_h
30
+ record.transform_keys!(&:to_sym)
31
+
32
+ # Perform the same preprocessing as seaborn's load_dataset function
33
+ preprocessor = :"preprocess_#{@name}_record"
34
+ __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
+
36
+ record
37
+ end
38
+
39
+ # The same preprocessing as seaborn.load_dataset
40
+ def preprocess_flights_record(record)
41
+ record[:month] &&= record[:month][0,3]
42
+ end
43
+
44
+ # The same preprocessing as seaborn.load_dataset
45
+ def preprocess_penguins_record(record)
46
+ record[:sex] &&= record[:sex].capitalize
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,169 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class SudachiSynonymDictionary < Dataset
7
+ class Synonym < Struct.new(:group_id,
8
+ :is_noun,
9
+ :expansion_type,
10
+ :lexeme_id,
11
+ :form_type,
12
+ :acronym_type,
13
+ :variant_type,
14
+ :categories,
15
+ :notation)
16
+ alias_method :noun?, :is_noun
17
+ end
18
+
19
+ def initialize
20
+ super()
21
+ @metadata.id = "sudachi-synonym-dictionary"
22
+ @metadata.name = "Sudachi synonym dictionary"
23
+ @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
24
+ @metadata.licenses = [
25
+ "Apache-2.0",
26
+ ]
27
+ @metadata.description = lambda do
28
+ download_description
29
+ end
30
+ end
31
+
32
+ def each
33
+ return to_enum(__method__) unless block_given?
34
+
35
+ lexeme_id_context = {}
36
+ open_data do |csv|
37
+ csv.each do |row|
38
+ group_id = row[0]
39
+ if group_id != lexeme_id_context[:group_id]
40
+ lexeme_id_context[:group_id] = group_id
41
+ lexeme_id_context[:counter] = 0
42
+ end
43
+ is_noun = (row[1] == "1")
44
+ expansion_type = normalize_expansion_type(row[2])
45
+ lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
46
+ form_type = normalize_form_type(row[4])
47
+ acronym_type = normalize_acronym_type(row[5])
48
+ variant_type = normalize_variant_type(row[6])
49
+ categories = normalize_categories(row[7])
50
+ notation = row[8]
51
+ synonym = Synonym.new(group_id,
52
+ is_noun,
53
+ expansion_type,
54
+ lexeme_id,
55
+ form_type,
56
+ acronym_type,
57
+ variant_type,
58
+ categories,
59
+ notation)
60
+ yield(synonym)
61
+ end
62
+ end
63
+ end
64
+
65
+ private
66
+ def open_data
67
+ data_path = cache_dir_path + "synonyms.txt"
68
+ unless data_path.exist?
69
+ data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
70
+ download(data_path, data_url)
71
+ end
72
+ CSV.open(data_path, skip_blanks: true) do |csv|
73
+ yield(csv)
74
+ end
75
+ end
76
+
77
+ def download_description
78
+ description_path = cache_dir_path + "synonyms.md"
79
+ unless description_path.exist?
80
+ description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
81
+ download(description_path, description_url)
82
+ end
83
+ description_path.read
84
+ end
85
+
86
+ def normalize_expansion_type(type)
87
+ case type
88
+ when "0", ""
89
+ :always
90
+ when "1"
91
+ :expanded
92
+ when "2"
93
+ :never
94
+ else
95
+ raise Error, "unknown expansion type: #{type.inspect}"
96
+ end
97
+ end
98
+
99
+ def normalize_lexeme_id(id, context)
100
+ case id
101
+ when ""
102
+ lexeme_id_context[:counter] += 1
103
+ lexeme_id_context[:counter]
104
+ else
105
+ # Use only the first lexeme ID.
106
+ # Example:
107
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
108
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
109
+ Integer(id.split("/").first, 10)
110
+ end
111
+ end
112
+
113
+ def normalize_form_type(type)
114
+ case type
115
+ when "0", ""
116
+ :typical
117
+ when "1"
118
+ :translation
119
+ when "2"
120
+ :alias
121
+ when "3"
122
+ :old_name
123
+ when "4"
124
+ :misnomer
125
+ else
126
+ raise Error, "unknown form type: #{type.inspect}"
127
+ end
128
+ end
129
+
130
+ def normalize_acronym_type(type)
131
+ case type
132
+ when "0", ""
133
+ :typical
134
+ when "1"
135
+ :alphabet
136
+ when "2"
137
+ :others
138
+ else
139
+ raise Error, "unknown acronym type: #{type.inspect}"
140
+ end
141
+ end
142
+
143
+ def normalize_variant_type(type)
144
+ case type
145
+ when "0", ""
146
+ :typical
147
+ when "1"
148
+ :alphabet
149
+ when "2"
150
+ :general
151
+ when "3"
152
+ :misspelled
153
+ else
154
+ raise Error, "unknown variant type: #{type.inspect}"
155
+ end
156
+ end
157
+
158
+ def normalize_categories(categories)
159
+ case categories
160
+ when ""
161
+ nil
162
+ when /\A\((.*)\)\z/
163
+ $1.split("/")
164
+ else
165
+ raise Error, "invalid categories: #{categories.inspect}"
166
+ end
167
+ end
168
+ end
169
+ end
@@ -2,19 +2,99 @@ require "datasets/dictionary"
2
2
 
3
3
  module Datasets
4
4
  class Table
5
+ class Record
6
+ include Enumerable
7
+
8
+ def initialize(table, index)
9
+ @table = table
10
+ @index = index
11
+ end
12
+
13
+ def [](column_name_or_column_index)
14
+ @table[column_name_or_column_index][@index]
15
+ end
16
+
17
+ def each
18
+ return to_enum(__method__) unless block_given?
19
+ @table.each_column.each do |column_name, column_values|
20
+ yield(column_name, column_values[@index])
21
+ end
22
+ end
23
+
24
+ def values
25
+ @table.each_column.collect do |_column_name, column_values|
26
+ column_values[@index]
27
+ end
28
+ end
29
+
30
+ def to_h
31
+ hash = {}
32
+ each do |column_name, column_value|
33
+ hash[column_name] = column_value
34
+ end
35
+ hash
36
+ end
37
+
38
+ def inspect
39
+ "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
40
+ end
41
+ end
42
+
5
43
  include Enumerable
6
44
 
45
+ attr_reader :dataset
7
46
  def initialize(dataset)
8
47
  @dataset = dataset
9
48
  @dictionaries = {}
10
49
  end
11
50
 
12
- def each(&block)
51
+ def n_columns
52
+ columner_data.size
53
+ end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
+ def n_rows
58
+ first_column = columner_data.first
59
+ return 0 if first_column.nil?
60
+ first_column[1].size
61
+ end
62
+
63
+ def column_names
64
+ columner_data.keys
65
+ end
66
+
67
+ def each_column(&block)
13
68
  columner_data.each(&block)
14
69
  end
70
+ alias_method :each, :each_column
15
71
 
16
- def [](name)
17
- columner_data[normalize_name(name)]
72
+ def each_record
73
+ return to_enum(__method__) unless block_given?
74
+ n_rows.times do |i|
75
+ yield(Record.new(self, i))
76
+ end
77
+ end
78
+
79
+ def find_record(row)
80
+ row += n_rows if row < 0
81
+ return nil if row < 0
82
+ return nil if row >= n_rows
83
+ Record.new(self, row)
84
+ end
85
+
86
+ def [](name_or_index)
87
+ case name_or_index
88
+ when Integer
89
+ index = name_or_index
90
+ columner_data.each_with_index do |(_name, values), i|
91
+ return values if i == index
92
+ end
93
+ nil
94
+ else
95
+ name = name_or_index
96
+ columner_data[normalize_name(name)]
97
+ end
18
98
  end
19
99
 
20
100
  def dictionary_encode(name)
@@ -0,0 +1,14 @@
1
+ require "rubygems/package"
2
+ require "zlib"
3
+
4
module Datasets
  # Mixin for datasets distributed as .tar.gz archives.
  module TarGzReadable
    # Opens data_path as a gzipped tar archive and yields a
    # Gem::Package::TarReader for it. The reader is only valid inside
    # the block; the underlying stream is closed on return.
    def open_tar_gz(data_path)
      Zlib::GzipReader.open(data_path) do |gz|
        Gem::Package::TarReader.new(gz) do |tar|
          yield(tar)
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
module Datasets
  # Gem version string.
  VERSION = "0.1.3"
end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path