ddi-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/ddi-parser.rb ADDED
@@ -0,0 +1,161 @@
1
+ require 'rubygems'
2
+ require 'libxml'
3
+ require 'models/catalog'
4
+ require 'models/category'
5
+ require 'models/category_statistic'
6
+ require 'models/study'
7
+ require 'models/study_date'
8
+ require 'models/variable'
9
+ require 'models/summary_stat'
10
+
11
module DDI
  # Parses a DDI XML metadata file with LibXML and exposes the study-level and
  # variable-level information it contains as DDI model objects.
  class Parser
    # Given a DDI metadata file, parse it and return study information.
    #
    # ddi_file - path of the DDI XML file; handed to LibXML::XML::Parser.file.
    #
    # Elements and attributes that are absent from the file leave the matching
    # attribute nil instead of raising NoMethodError.
    #
    # Returns a DDI::Study object.
    def parse(ddi_file)
      doc = LibXML::XML::Parser.file(ddi_file).parse
      study = DDI::Study.new
      # Every abstract element in the document contributes; the stripped texts
      # are concatenated without a separator.
      study.abstract = doc.find('//abstract').map { |node| node.content.strip }.join.strip
      study.title = first_child_text(doc.find('//stdyDscr/citation/titlStmt/titl')[0])
      study.id = first_child_text(doc.find('//IDNo')[0])
      study.dates = study_dates(doc)
      study.sampling_procedure = first_child_text(doc.find('//sampProc')[0])
      study.variables = get_variable_information(doc)
      study
    end

    private

    # Information about the variables.
    #
    # doc - the parsed LibXML document.
    #
    # Returns an Array of DDI::Variable, one per //dataDscr/var element
    # (empty when the document has none).
    def get_variable_information(doc)
      groups = group_labels_by_variable_id(doc)
      doc.find('//dataDscr/var').map { |var| build_variable(var, groups) }
    end

    # Start and finish dates for the study, one DDI::StudyDate per collDate.
    def study_dates(doc)
      doc.find('//sumDscr/collDate').map do |node|
        study_date = DDI::StudyDate.new
        study_date.type = attribute_text(node, 'event')
        study_date.date = attribute_text(node, 'date')
        study_date
      end
    end

    # Maps each variable id listed in a varGrp 'var' attribute to that group's
    # stripped label. Groups without a 'var' attribute or without a label are
    # ignored; when an id is listed by several groups the first one wins.
    def group_labels_by_variable_id(doc)
      index = {}
      doc.find('//dataDscr/varGrp').each do |group|
        members = group.attributes.get_attribute('var')
        label = first_child_text(group.find('./labl')[0])
        next if members.nil? || label.nil?

        members.value.split(' ').each { |id| index[id] ||= label }
      end
      index
    end

    # Builds one DDI::Variable from a var element.
    #
    # groups - Hash of variable id => group label.
    def build_variable(var, groups)
      variable = DDI::Variable.new
      variable.id = attribute_text(var, 'ID')
      variable.name = attribute_text(var, 'name')
      variable.file = attribute_text(var, 'files')
      variable.interval = attribute_text(var, 'intrvl')
      # Unlike the other fields the label uses the element's whole content,
      # not just its first child.
      label = var.find('./labl')[0]
      variable.label = label.content.strip if label
      assign_range(variable, var)
      assign_question(variable, var)
      variable.summary_stats = typed_statistics(var.find('./sumStat'), DDI::SummaryStat)
      variable.categories = var.find('./catgry').map { |cat| build_category(cat) }
      # What group is the variable in.
      variable.group = groups[variable.id]
      variable
    end

    # Copies the max/min attributes of the first child of valrng, if any.
    def assign_range(variable, var)
      valrng = var.find('./valrng')[0]
      range = valrng && valrng.first
      return unless range

      variable.max = attribute_text(range, 'max')
      variable.min = attribute_text(range, 'min')
    end

    # Copies the question literal and the interviewer instruction from qstn.
    def assign_question(variable, var)
      question = var.find('./qstn')[0]
      return unless question

      variable.question = first_child_text(question.find('./qstnLit')[0])
      variable.interview_instruction = first_child_text(question.find('./ivuInstr')[0])
    end

    # Categories in ddi are value domains in mb.
    def build_category(cat)
      category = DDI::Category.new
      category.value = text_or_not_available(cat.find('./catValu')[0])
      category.label = text_or_not_available(cat.find('./labl')[0])
      category.category_statistics = typed_statistics(cat.find('./catStat'), DDI::CategoryStatistic)
      category
    end

    # Turns sumStat/catStat nodes into model objects (anything responding to
    # type= and value=). Nodes without a 'type' attribute are skipped.
    def typed_statistics(nodes, model)
      statistics = []
      nodes.each do |node|
        type = attribute_text(node, 'type')
        next if type.nil?

        statistic = model.new
        statistic.type = type
        statistic.value = first_child_text(node)
        statistics.push(statistic)
      end
      statistics
    end

    # 'N/A' when the element is missing; nil when it exists but has no child.
    def text_or_not_available(node)
      node ? first_child_text(node) : 'N/A'
    end

    # Stripped value of the named attribute, or nil when it is not present.
    def attribute_text(node, name)
      attribute = node.attributes.get_attribute(name)
      attribute && attribute.value.strip
    end

    # Stripped content of the node's first child, or nil when the node is nil
    # or has no children. Only the first child is read, so text that follows
    # a nested child element is not included.
    def first_child_text(node)
      child = node && node.first
      child && child.content.strip
    end
  end
end
@@ -0,0 +1,10 @@
1
module DDI
  # A Nesstar catalog object, can contain 1 or more studies (ie datasets).
  #
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Catalog
    attr_accessor :studies, :label, :description, :nesstar_id, :nesstar_uri
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Value information for the rows in a variable.
  #
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Category
    attr_accessor :value, :label, :category_statistics
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Stats about a category (ie value domain) belonging to a variable.
  #
  # Plain value holder: both attributes are readable and writable and are nil
  # until assigned.
  class CategoryStatistic
    attr_accessor :type, :value
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Contains a set of variables and belongs to a catalog.
  #
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Study
    attr_accessor :variables, :abstract, :title, :id, :dates,
                  :sampling_procedure, :weight, :nesstar_id, :nesstar_uri
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Dates that are important to the study eg. start date or end date.
  #
  # Plain value holder: both attributes are readable and writable and are nil
  # until assigned.
  class StudyDate
    attr_accessor :type, :date
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Stats at the variable level, eg how many valid or invalid rows.
  #
  # Plain value holder: both attributes are readable and writable and are nil
  # until assigned.
  class SummaryStat
    attr_accessor :type, :value
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Information about a variable/column in a dataset.
  #
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Variable
    attr_accessor :name, :label, :group, :id, :file, :interval, :max, :min,
                  :question, :interview_instruction, :summary_stats, :categories
  end
end
@@ -0,0 +1,5 @@
1
module DDI
  # Declared as a class, not a module: lib/ddi-parser.rb defines DDI::Parser
  # as a class, and Ruby raises TypeError when the same constant is opened as
  # a module in one file and as a class in another.
  class Parser
    # Version of the ddi-parser gem.
    VERSION = "0.0.1".freeze
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ddi-parser
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Ian Dunlop
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-05-06 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: libxml-ruby
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - "="
28
+ - !ruby/object:Gem::Version
29
+ hash: 27
30
+ segments:
31
+ - 1
32
+ - 1
33
+ - 4
34
+ version: 1.1.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ description: This gem parses ddi metadata files
38
+ email:
39
+ - ian.dunlop@manchester.ac.uk
40
+ executables: []
41
+
42
+ extensions: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ files:
47
+ - lib/ddi-parser.rb
48
+ - lib/models/catalog.rb
49
+ - lib/models/category.rb
50
+ - lib/models/category_statistic.rb
51
+ - lib/models/study.rb
52
+ - lib/models/study_date.rb
53
+ - lib/models/summary_stat.rb
54
+ - lib/models/variable.rb
55
+ - lib/models/version.rb
56
+ has_rdoc: true
57
+ homepage: http://github.com/mygrid/ddi-parser
58
+ licenses: []
59
+
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ hash: 3
71
+ segments:
72
+ - 0
73
+ version: "0"
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ hash: 3
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ requirements: []
84
+
85
+ rubyforge_project: ddi-parser
86
+ rubygems_version: 1.3.7
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: API for parsing ddi metadata files and returning results
90
+ test_files: []
91
+