mida 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.3.1 (5th July 2011)
2
+ * Add bin/mida executable
3
+ * Changed <tt>Item#to_h</tt> to only return keys with values
4
+ * Add validate option to +Document+ and +Item+ to turn on/off validation
5
+
1
6
  == 0.3.0 (29th June 2011)
2
7
  * Merge +VocabularyDesc+ into +Vocabulary+
3
8
  * Vocabularies are now auto registered using +inherited+ hook
data/README.rdoc CHANGED
@@ -18,7 +18,20 @@ Mida keeps RubyGems[http://rubygems.org/gems/mida] up-to-date with its latest ve
18
18
 
19
19
  * +Nokogiri+
20
20
 
21
- == Usage
21
+ == Command Line Usage
22
+
23
+ To use the command line tool, supply it with the urls or filenames that you
24
+ would like to be parsed (by default each item is output as yaml):
25
+ mida http://lawrencewoodman.github.com/mida/news/
26
+
27
+ If you want to search for specific types you can use the <tt>-t</tt> switch
28
+ followed by a Regular Expression:
29
+ mida -t /person/i http://lawrencewoodman.github.com/mida/news/
30
+
31
+ For more information look at <tt>mida</tt>'s help:
32
+ mida -h
33
+
34
+ == Library Usage
22
35
  The following examples assume that you have required +mida+ and
23
36
  <tt>open-uri</tt>.
24
37
 
data/Rakefile CHANGED
@@ -1,24 +1,25 @@
1
1
  task :default => :spec
2
2
 
3
3
  desc "Create Gem"
4
- require 'rake/gempackagetask'
4
+ require 'rubygems/package_task'
5
5
  spec = Gem::Specification.new do |s|
6
6
  s.name = "mida"
7
7
  s.summary = "A Microdata parser/extractor library"
8
8
  s.description = "A Microdata parser and extractor library, based on the latest published version of the Microdata Specification, dated 5th April 2011."
9
- s.version = "0.3.0"
9
+ s.version = "0.3.1"
10
10
  s.author = "Lawrence Woodman"
11
11
  s.email = "lwoodman@vlifesystems.com"
12
12
  s.homepage = %q{http://lawrencewoodman.github.com/mida/}
13
13
  s.platform = Gem::Platform::RUBY
14
14
  s.required_ruby_version = '>=1.9'
15
15
  s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
16
+ s.executables = ['mida']
16
17
  s.extra_rdoc_files = ['README.rdoc', 'LICENSE.rdoc', 'CHANGELOG.rdoc']
17
18
  s.rdoc_options << '--main' << 'README.rdoc'
18
- s.add_dependency('nokogiri')
19
+ s.add_dependency('nokogiri', '>= 1.5')
19
20
  s.add_development_dependency('rspec', '>= 2.0' )
20
21
  end
21
- Rake::GemPackageTask.new(spec).define
22
+ Gem::PackageTask.new(spec).define
22
23
 
23
24
  desc "Run Specs"
24
25
  require 'rspec/core/rake_task'
data/bin/mida ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env ruby
2
+ ## Mida: Microdata parser/extractor
3
+ ##
4
+ ## Usage: mida [options...] [sources...]
5
+ ##
6
+ ## Find the Microdata in the given 'sources', which can be urls or files.
7
+ ## Urls must be prefixed with: http://
8
+ ##
9
+
10
+ require 'open-uri'
11
+ require 'yaml'
12
+ require 'optparse'
13
+
14
+ # Returns the '##' comment lines at the top of this file, for use as the help banner
15
+ def banner
16
+ File.readlines(__FILE__).
17
+ grep(/^##.*/).
18
+ map { |line| line.chomp[3..-1] }.
19
+ join("\n")+"\n"+" Options:\n"
20
+ end
21
+
22
+ begin
23
+ require 'mida'
24
+ rescue LoadError
25
+ raise if $!.to_s !~ /mida/
26
+ libdir = File.expand_path("../../lib", __FILE__).sub(/^#{Dir.pwd}/, '.')
27
+ if !$:.include?(libdir)
28
+ warn "warn: #{$!.to_s}. trying again with #{libdir} on load path."
29
+ $:.unshift libdir
30
+ retry
31
+ end
32
+ raise
33
+ end
34
+
35
+ options = {sourcename: true, validate: true}
36
+ ARGV.options do |option|
37
+ option.banner = banner
38
+ option.on('-c','--count', 'Display the counts of each Microdata Type') do
39
+ options[:count] = true
40
+ end
41
+ option.on('-n','--no-sourcename', "Don't display the source name") do
42
+ options[:sourcename] = false
43
+ end
44
+ option.on('-t','--type TYPE', Regexp,
45
+ 'A regexp to match the itemtypes against') do |type|
46
+ options[:type] = type
47
+ end
48
+ option.on('-v','--no-validate',
49
+ "Don't validate the items against known Vocabularies") do
50
+ options[:validate] = false
51
+ end
52
+ option.on_tail('-h','--help', 'This help message') {puts option; exit}
53
+
54
+ begin
55
+ option.parse!
56
+ rescue OptionParser::InvalidOption => error
57
+ puts "#{error}\n#{option}"; exit
58
+ end
59
+ if ARGV.empty? then puts option; exit end
60
+ end
61
+
62
+ # Get the url from the source if there is one
63
+ def get_url
64
+ ARGV.first =~ %r{^http://.*} ? ARGV.first : nil
65
+ end
66
+
67
+ # Display each item as yaml
68
+ def display_items(items)
69
+ items.each {|item| puts item.to_h.to_yaml}
70
+ end
71
+
72
+ # Returns a hash {type => count}
73
+ def count_types(types)
74
+ types.each_with_object(Hash.new(0)) {|type,count| count[type] += 1}
75
+ end
76
+
77
+ # Display the number of each type of item
78
+ def display_count(items)
79
+ types = items.collect {|item| item.type}
80
+ count_types(types).each {|type, count| puts "Found #{count} #{type}"}
81
+ end
82
+
83
+ def parse_source(source, options)
84
+ url = get_url
85
+ begin
86
+ open(source) do |f|
87
+ doc = Mida::Document.new(f, url, options[:validate])
88
+ type = options[:type] || %r{}
89
+ items = doc.search(type)
90
+ if items.empty?
91
+ puts "No microdata found in this document."; exit
92
+ else
93
+ if options.include?(:count)
94
+ display_count(items)
95
+ else
96
+ display_items(items)
97
+ end
98
+ end
99
+ end
100
+ rescue
101
+ puts "Failed to parse: #{source}"
102
+ exit
103
+ end
104
+ end
105
+
106
+ ARGV.each do |source|
107
+ puts "Parsing: #{source}" if options[:sourcename]
108
+ parse_source(source, options)
109
+ puts
110
+ end
data/lib/mida/document.rb CHANGED
@@ -15,10 +15,11 @@ module Mida
15
15
  # [target] The string containing the html that you want to parse.
16
16
  # [page_url] The url of target, used to form absolute urls. This must
17
17
  # include the filename, e.g. index.html.
18
- def initialize(target, page_url=nil)
18
+ # [validate] Whether to validate the items against known vocabularies.
19
+ def initialize(target, page_url=nil, validate=true)
19
20
  @doc = Nokogiri(target)
20
21
  @page_url = page_url
21
- @items = extract_items
22
+ @items = extract_items(validate)
22
23
  end
23
24
 
24
25
  # Implements method for Enumerable
@@ -45,13 +46,13 @@ module Mida
45
46
  end
46
47
 
47
48
  private
48
- def extract_items
49
+ def extract_items(validate)
49
50
  itemscopes = @doc.search('//*[@itemscope and not(@itemprop)]')
50
51
  return nil unless itemscopes
51
52
 
52
53
  itemscopes.collect do |itemscope|
53
54
  itemscope = Itemscope.new(itemscope, @page_url)
54
- Item.new(itemscope)
55
+ Item.new(itemscope, validate)
55
56
  end
56
57
  end
57
58
 
data/lib/mida/item.rb CHANGED
@@ -24,12 +24,13 @@ module Mida
24
24
  # its +properties+
25
25
  #
26
26
  # [itemscope] The itemscope that has been parsed by +Itemscope+
27
- def initialize(itemscope)
27
+ # [validate] Whether to validate the item against known vocabularies
28
+ def initialize(itemscope, validate=true)
28
29
  @type = itemscope.type
29
30
  @id = itemscope.id
30
31
  @vocabulary = Mida::Vocabulary.find(@type)
31
32
  @properties = itemscope.properties
32
- validate_properties
33
+ validate_properties if validate
33
34
  end
34
35
 
35
36
  # Return a Hash representation
@@ -39,7 +40,12 @@ module Mida
39
40
  # properties: {'a name' => 'avalue' }
40
41
  # }
41
42
  def to_h
42
- {type: @type, id: @id, properties: properties_to_h(@properties)}
43
+ # Only fill hash with non-nil values
44
+ hash = {}
45
+ @type and hash[:type] = @type
46
+ @id and hash[:id] = @id
47
+ @properties.any? and hash[:properties] = properties_to_h(@properties)
48
+ hash
43
49
  end
44
50
 
45
51
  def to_s
@@ -58,7 +64,11 @@ module Mida
58
64
  @properties =
59
65
  @properties.each_with_object({}) do |(property, values), hash|
60
66
  valid_values = validate_values(property, values)
61
- hash[property] = valid_values unless valid_values.nil?
67
+ if valid_values.respond_to?(:any?)
68
+ hash[property] = valid_values if valid_values.any?
69
+ else
70
+ hash[property] = valid_values
71
+ end
62
72
  end
63
73
  end
64
74
 
@@ -77,9 +87,9 @@ module Mida
77
87
  # Return valid values, converted to the correct +DataType+
78
88
  # or +Item+ and number if necessary
79
89
  def validate_values(property, values)
80
- return nil unless valid_property?(property, values)
90
+ return [] unless valid_property?(property, values)
81
91
  prop_num = property_number(property)
82
- return nil unless valid_num_values?(prop_num, values)
92
+ return [] unless valid_num_values?(prop_num, values)
83
93
  prop_types = property_types(property)
84
94
 
85
95
  valid_values = values.each_with_object([]) do |value, valid_values|
@@ -30,16 +30,6 @@ def match_array(value_array, expected_results)
30
30
  end
31
31
  end
32
32
 
33
- shared_examples_for 'one root itemscope' do
34
- it 'should not match itemscopes with different names' do
35
- @md.search(%r{nothing}).size.should == 0
36
- end
37
-
38
- it 'should find the correct number of itemscopes' do
39
- @md.items.size.should == 1
40
- end
41
- end
42
-
43
33
  describe Mida::Document do
44
34
  before do
45
35
  html = '
@@ -78,6 +68,49 @@ describe Mida::Document do
78
68
  end
79
69
  end
80
70
 
71
+ describe Mida::Document, 'when initialized' do
72
+ before do
73
+ @html = '
74
+ <html><body>
75
+ <div itemscope itemtype="http://data-vocabulary.org/Review">
76
+ <span itemprop="itemreviewed">Romeo Pizza</span>
77
+ <span itemprop="itemreviewed">Some Other Pizza</span>
78
+ </div>
79
+ </body></html>
80
+ '
81
+
82
+ class Review < Mida::Vocabulary
83
+ itemtype %r{http://data-vocabulary.org/Review}
84
+ has_one 'item_reviewed'
85
+ end
86
+ end
87
+
88
+ context 'with validation on' do
89
+ before do
90
+ @md = Mida::Document.new(@html)
91
+ end
92
+
93
+ it 'should reject properties for items of known vocabularies that are not valid' do
94
+ @md.items[0].properties.should == {}
95
+ end
96
+ end
97
+
98
+ context 'with validation off' do
99
+ before do
100
+ @md = Mida::Document.new(@html, false)
101
+ end
102
+
103
+ it 'should accept properties for items of known vocabularies even if not valid' do
104
+ @md.items[0].properties['itemreviewed'].should ==
105
+ ['Romeo Pizza', 'Some Other Pizza']
106
+ end
107
+ end
108
+
109
+ after do
110
+ Mida::Vocabulary.unregister(Review)
111
+ end
112
+ end
113
+
81
114
  describe Mida::Document, 'when run against a full html document containing itemscopes with and without itemtypes' do
82
115
 
83
116
  before do
@@ -161,7 +194,6 @@ describe Mida::Document, 'when run against a full html document containing two n
161
194
  it 'should return all the properties and types with the correct values for 1st itemscope' do
162
195
  expected_results = [{
163
196
  type: 'http://data-vocabulary.org/Review',
164
- id: nil,
165
197
  properties: {
166
198
  'itemreviewed' => ['Romeo Pizza'],
167
199
  'rating' => ['4.5']
@@ -173,7 +205,6 @@ describe Mida::Document, 'when run against a full html document containing two n
173
205
  it 'should return all the properties from the text for 2nd itemscope' do
174
206
  expected_results = [{
175
207
  type: 'http://data-vocabulary.org/Organization',
176
- id: nil,
177
208
  properties: {
178
209
  'name' => ['An org name'],
179
210
  'url' => ['http://example.com']
@@ -209,7 +240,13 @@ describe Mida::Document, 'when run against a full html document containing one
209
240
  @md = Mida::Document.new(html)
210
241
  end
211
242
 
212
- it_should_behave_like 'one root itemscope'
243
+ it 'should not match itemscopes with different names' do
244
+ @md.search(%r{nothing}).size.should == 0
245
+ end
246
+
247
+ it 'should find the correct number of itemscopes' do
248
+ @md.items.size.should == 1
249
+ end
213
250
 
214
251
  it 'should return the correct number of itemscopes' do
215
252
  vocabularies = [
@@ -224,13 +261,11 @@ describe Mida::Document, 'when run against a full html document containing one
224
261
  it 'should return all the properties from the text with the correct values' do
225
262
  expected_results = [{
226
263
  type: 'http://data-vocabulary.org/Product',
227
- id: nil,
228
264
  properties: {
229
265
  'name' => ['DC07'],
230
266
  'brand' => ['Dyson'],
231
267
  'review' => [{
232
268
  type: 'http://data-vocabulary.org/Review-aggregate',
233
- id: nil,
234
269
  properties: {
235
270
  'count' => ['1'],
236
271
  'rating' => ['5.0']
@@ -287,14 +322,12 @@ describe Mida::Document, 'when run against a document containing an itemscope
287
322
  pending("get the contains: feature working")
288
323
  expected_result = {
289
324
  type: 'http://data-vocabulary.org/Product',
290
- id: nil,
291
325
  properties: {
292
326
  'name' => 'DC07',
293
327
  'brand' => 'Dyson'
294
328
  },
295
329
  contains: {
296
330
  type: 'http://data-vocabulary.org/Review-aggregate',
297
- id: nil,
298
331
  properties: {
299
332
  'count' => '1',
300
333
  'rating' => '5.0'
data/spec/item_spec.rb CHANGED
@@ -27,12 +27,8 @@ describe Mida::Item, 'when initialized with an incomplete itemscope' do
27
27
  @item.properties.should == {}
28
28
  end
29
29
 
30
- it '#to_h should return the correct type and properties' do
31
- @item.to_h.should == {
32
- type: nil,
33
- id: nil,
34
- properties: {}
35
- }
30
+ it '#to_h should return an empty hash' do
31
+ @item.to_h.should == {}
36
32
  end
37
33
  end
38
34
 
@@ -66,7 +62,7 @@ describe Mida::Item, 'when initialized with a complete itemscope of an unknown t
66
62
  }
67
63
  end
68
64
 
69
- it '#to_h should return the correct type and properties' do
65
+ it '#to_h should return the correct type, id and properties' do
70
66
  @item.to_h.should == {
71
67
  type: 'book',
72
68
  id: "urn:isbn:978-1-849510-50-9",
@@ -113,7 +109,7 @@ describe Mida::Item, 'when initialized with an itemscope of a known type' do
113
109
  @item.properties['url'].should == ['http://example.com/user/lorry']
114
110
  end
115
111
 
116
- it 'should reject datatypes that are not valid' do
112
+ it 'should accept datatypes that are valid' do
117
113
  @item.properties['date'][0].should == '2nd October 2009'
118
114
  end
119
115
 
@@ -121,10 +117,6 @@ describe Mida::Item, 'when initialized with an itemscope of a known type' do
121
117
  @item.properties['date'][1].should == Date.iso8601('2009-10-02')
122
118
  end
123
119
 
124
- it 'should reject datatypes that are not valid' do
125
- @item.properties['date'][1].should == Date.iso8601('2009-10-02')
126
- end
127
-
128
120
  it '#properties should return the same properties as the itemscope' do
129
121
  @item.properties.should == {
130
122
  'name' => 'Lorry Woodman',
@@ -136,7 +128,6 @@ describe Mida::Item, 'when initialized with an itemscope of a known type' do
136
128
  it '#to_h should return the correct type and properties' do
137
129
  @item.to_h.should == {
138
130
  type: 'http://example.com/vocab/person',
139
- id: nil,
140
131
  properties: {
141
132
  'name' => 'Lorry Woodman',
142
133
  'date' => ['2nd October 2009', Date.iso8601('2009-10-02')],
@@ -157,31 +148,58 @@ describe Mida::Item, 'when initialized with an itemscope of a known type that do
157
148
  itemtype %r{http://example.com/vocab/person}
158
149
  has_one 'name', 'tel'
159
150
  has_many 'url', 'city'
151
+ has_one 'dob' do
152
+ extract Mida::DataType::ISO8601Date
153
+ end
160
154
  end
161
155
 
162
- itemscope = mock(Mida::Itemscope)
163
- itemscope.stub!(:type).and_return("http://example.com/vocab/person")
164
- itemscope.stub!(:id).and_return(nil)
165
- itemscope.stub!(:properties).and_return(
156
+ @itemscope = mock(Mida::Itemscope)
157
+ @itemscope.stub!(:type).and_return("http://example.com/vocab/person")
158
+ @itemscope.stub!(:id).and_return(nil)
159
+ @itemscope.stub!(:properties).and_return(
166
160
  { 'name' => ['Lorry Woodman'],
167
161
  'tel' => ['000004847582', '111111857485'],
168
162
  'url' => ['http://example.com/user/lorry'],
169
- 'city' => ['Bristol']
163
+ 'city' => ['Bristol'],
164
+ 'dob' => 'When I was born'
170
165
  }
171
166
  )
172
- @item = Mida::Item.new(itemscope)
173
167
  end
174
168
 
175
- it '#vocabulary should return the correct vocabulary' do
176
- @item.vocabulary.should == Person
177
- end
169
+ context 'when validation selected' do
170
+ before do
171
+ @item = Mida::Item.new(@itemscope)
172
+ end
173
+
174
+ it '#vocabulary should return the correct vocabulary' do
175
+ @item.vocabulary.should == Person
176
+ end
177
+
178
+ it 'should not keep properties that have too many values' do
179
+ @item.properties.should_not have_key('tel')
180
+ end
178
181
 
179
- it 'should not keep properties that have too many values' do
180
- @item.properties.should_not have_key('tel')
182
+ it 'should not keep properties that have the wrong DataType' do
183
+ @item.properties.should_not have_key('dob')
184
+ end
181
185
  end
182
186
 
183
- it 'should keep have_many properties even if they have only one value' do
184
- @item.properties.should have_key('city')
187
+ context 'when validation not selected' do
188
+ before do
189
+ @item = Mida::Item.new(@itemscope, false)
190
+ end
191
+
192
+ it '#vocabulary should return the correct vocabulary' do
193
+ @item.vocabulary.should == Person
194
+ end
195
+
196
+ it 'should keep properties even if they have too many values' do
197
+ @item.properties.should have_key('tel')
198
+ end
199
+
200
+ it 'should keep properties even if they have the wrong DataType' do
201
+ @item.properties.should have_key('dob')
202
+ end
185
203
  end
186
204
 
187
205
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: mida
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.3.0
5
+ version: 0.3.1
6
6
  platform: ruby
7
7
  authors:
8
8
  - Lawrence Woodman
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-06-29 00:00:00 Z
13
+ date: 2011-07-05 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: nokogiri
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - ">="
22
22
  - !ruby/object:Gem::Version
23
- version: "0"
23
+ version: "1.5"
24
24
  type: :runtime
25
25
  version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
@@ -36,8 +36,8 @@ dependencies:
36
36
  version_requirements: *id002
37
37
  description: A Microdata parser and extractor library, based on the latest published version of the Microdata Specification, dated 5th April 2011.
38
38
  email: lwoodman@vlifesystems.com
39
- executables: []
40
-
39
+ executables:
40
+ - mida
41
41
  extensions: []
42
42
 
43
43
  extra_rdoc_files:
@@ -78,6 +78,7 @@ files:
78
78
  - README.rdoc
79
79
  - LICENSE.rdoc
80
80
  - Rakefile
81
+ - bin/mida
81
82
  homepage: http://lawrencewoodman.github.com/mida/
82
83
  licenses: []
83
84