ddi-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/ddi-parser.rb ADDED
@@ -0,0 +1,161 @@
1
+ require 'rubygems'
2
+ require 'libxml'
3
+ require 'models/catalog'
4
+ require 'models/category'
5
+ require 'models/category_statistic'
6
+ require 'models/study'
7
+ require 'models/study_date'
8
+ require 'models/variable'
9
+ require 'models/summary_stat'
10
+
11
module DDI
  # Reads a DDI metadata XML file with LibXML and maps it onto the DDI model
  # objects (Study, StudyDate, Variable, SummaryStat, Category,
  # CategoryStatistic).
  class Parser

    # Given a DDI metadata file, parse it and return study information.
    #
    # ddi_file - path of the DDI XML file; handed to LibXML::XML::Parser.file.
    #
    # Returns a DDI::Study. Elements or attributes that are absent from the
    # document leave the matching field nil (abstract becomes "", dates and
    # variables become empty arrays) instead of raising NoMethodError.
    # study.weight is not populated.
    #
    # NOTE(review): every '//...' expression below is absolute, so it matches
    # anywhere in the document, not only below stdyDscr (e.g. the first IDNo
    # in document order is used as the study id) -- confirm this is intended.
    def parse(ddi_file)
      doc = LibXML::XML::Parser.file(ddi_file).parse
      study = DDI::Study.new
      # All abstract elements are stripped and concatenated without separator.
      study.abstract = doc.find('//abstract').map { |node| node.content.strip }.join.strip
      study.title = first_child_text(doc.find('//stdyDscr/citation/titlStmt/titl')[0])
      study.id = first_child_text(doc.find('//IDNo')[0])
      study.dates = study_dates(doc)
      study.sampling_procedure = first_child_text(doc.find('//sampProc')[0])
      study.variables = get_variable_information(doc)
      study
    end

    private

    # Stripped content of the first child of +node+, or nil when the node is
    # missing or has no children.
    def first_child_text(node)
      return nil if node.nil? || node.first.nil?
      node.first.content.strip
    end

    # Stripped value of the attribute +name+ on +node+, or nil when absent.
    def attribute_text(node, name)
      attribute = node.attributes.get_attribute(name)
      attribute.nil? ? nil : attribute.value.strip
    end

    # One DDI::StudyDate per collDate element (its event and date attributes).
    def study_dates(doc)
      doc.find('//sumDscr/collDate').map do |node|
        study_date = DDI::StudyDate.new
        study_date.type = attribute_text(node, 'event')
        study_date.date = attribute_text(node, 'date')
        study_date
      end
    end

    # Information about the variables: one DDI::Variable per dataDscr/var.
    def get_variable_information(doc)
      groups = variable_groups(doc)
      doc.find('//dataDscr/var').map { |var| build_variable(var, groups) }
    end

    # Hash of group label => array of variable IDs, built from each varGrp's
    # labl child and space-separated var attribute. Groups lacking either one
    # are skipped.
    def variable_groups(doc)
      groups = {}
      doc.find('//dataDscr/varGrp').each do |vargroup|
        members = vargroup.attributes.get_attribute('var')
        label = vargroup.find('./labl')[0]
        next if members.nil? || label.nil? || label.first.nil?
        groups[label.first.content] = members.value.split(' ')
      end
      groups
    end

    # Builds the DDI::Variable described by a single var element.
    def build_variable(var, groups)
      variable = DDI::Variable.new
      variable.id = attribute_text(var, 'ID')
      variable.name = attribute_text(var, 'name')
      variable.file = attribute_text(var, 'files')
      variable.interval = attribute_text(var, 'intrvl')

      label = var.find('./labl')[0]
      variable.label = label.content.strip unless label.nil?

      # Ask for the range element by name: taking the first child of valrng
      # picks up a whitespace text node when the XML is pretty-printed.
      range = var.find('./valrng/range')[0]
      unless range.nil?
        variable.max = attribute_text(range, 'max')
        variable.min = attribute_text(range, 'min')
      end

      question = var.find('./qstn')[0]
      unless question.nil?
        variable.question = first_child_text(question.find('./qstnLit')[0])
        variable.interview_instruction = first_child_text(question.find('./ivuInstr')[0])
      end

      variable.summary_stats = summary_stats(var)
      variable.categories = categories(var)

      # What group is the variable in: the first group listing its ID.
      groups.each do |group_label, ids|
        if ids.include?(variable.id)
          variable.group = group_label.strip
          break
        end
      end
      variable
    end

    # One DDI::SummaryStat per sumStat child of +var+.
    def summary_stats(var)
      var.find('./sumStat').map do |stat|
        statistic = DDI::SummaryStat.new
        statistic.type = attribute_text(stat, 'type')
        statistic.value = first_child_text(stat)
        statistic
      end
    end

    # One DDI::Category per catgry child of +var+ (categories in ddi are
    # value domains in mb). A missing catValu or labl element yields 'N/A';
    # an element that is present but empty yields nil.
    def categories(var)
      var.find('./catgry').map do |cat|
        category = DDI::Category.new
        value_node = cat.find('./catValu')[0]
        category.value = value_node.nil? ? 'N/A' : first_child_text(value_node)
        label_node = cat.find('./labl')[0]
        category.label = label_node.nil? ? 'N/A' : first_child_text(label_node)
        category.category_statistics = category_statistics(cat)
        category
      end
    end

    # DDI::CategoryStatistic objects for the catStat children of +cat+ that
    # carry a type attribute; untyped catStat elements are ignored.
    def category_statistics(cat)
      statistics = []
      cat.find('./catStat').each do |catstat|
        type = attribute_text(catstat, 'type')
        next if type.nil?
        statistic = DDI::CategoryStatistic.new
        statistic.type = type
        statistic.value = first_child_text(catstat)
        statistics.push(statistic)
      end
      statistics
    end

  end
end
@@ -0,0 +1,10 @@
1
module DDI
  # A Nesstar catalog object; can contain one or more studies (i.e. datasets).
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Catalog
    attr_accessor :studies, :label, :description, :nesstar_id, :nesstar_uri
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Value information for the rows in a variable: the category value, its
  # label and the statistics attached to it. All attributes are readable and
  # writable and start out nil.
  class Category
    attr_accessor :value, :label, :category_statistics
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Stats about a category (i.e. value domain) belonging to a variable.
  # Holds a readable and writable type and value, both nil until assigned.
  class CategoryStatistic
    attr_accessor :type, :value
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Contains a set of variables and belongs to a catalog.
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Study
    attr_accessor :variables, :abstract, :title, :id, :dates,
                  :sampling_procedure, :weight, :nesstar_id, :nesstar_uri
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Dates that are important to the study, e.g. start date or end date.
  # Holds a readable and writable type and date, both nil until assigned.
  class StudyDate
    attr_accessor :type, :date
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Stats at the variable level, e.g. how many valid or invalid rows.
  # Holds a readable and writable type and value, both nil until assigned.
  class SummaryStat
    attr_accessor :type, :value
  end
end
@@ -0,0 +1,11 @@
1
module DDI
  # Information about a variable/column in a dataset.
  # Plain value holder: every attribute is readable and writable and is nil
  # until assigned.
  class Variable
    attr_accessor :name, :label, :group, :id, :file, :interval,
                  :max, :min, :question, :interview_instruction,
                  :summary_stats, :categories
  end
end
@@ -0,0 +1,5 @@
1
module DDI
  # Gem version, exposed as DDI::Parser::VERSION.
  #
  # Parser is opened as a class here because lib/ddi-parser.rb defines
  # DDI::Parser as a class; opening it as a module made Ruby raise TypeError
  # as soon as both files were loaded in the same process.
  class Parser
    VERSION = "0.0.1"
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ddi-parser
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Ian Dunlop
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-05-06 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: libxml-ruby
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - "="
28
+ - !ruby/object:Gem::Version
29
+ hash: 27
30
+ segments:
31
+ - 1
32
+ - 1
33
+ - 4
34
+ version: 1.1.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ description: This gem parses ddi metadata files
38
+ email:
39
+ - ian.dunlop@manchester.ac.uk
40
+ executables: []
41
+
42
+ extensions: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ files:
47
+ - lib/ddi-parser.rb
48
+ - lib/models/catalog.rb
49
+ - lib/models/category.rb
50
+ - lib/models/category_statistic.rb
51
+ - lib/models/study.rb
52
+ - lib/models/study_date.rb
53
+ - lib/models/summary_stat.rb
54
+ - lib/models/variable.rb
55
+ - lib/models/version.rb
56
+ has_rdoc: true
57
+ homepage: http://github.com/mygrid/ddi-parser
58
+ licenses: []
59
+
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ hash: 3
71
+ segments:
72
+ - 0
73
+ version: "0"
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ hash: 3
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ requirements: []
84
+
85
+ rubyforge_project: ddi-parser
86
+ rubygems_version: 1.3.7
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: API for parsing ddi metadata files and returning results
90
+ test_files: []
91
+