mida 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.3.1 (5th July 2011)
2
+ * Add bin/mida executable
3
+ * Changed <tt>Item#to_h</tt> to only return keys with values
4
+ * Add validate option to +Document+ and +Item+ to turn on/off validation
5
+
1
6
  == 0.3.0 (29th June 2011)
2
7
  * Merge +VocabularyDesc+ into +Vocabulary+
3
8
  * Vocabularies are now auto registered using +inherited+ hook
data/README.rdoc CHANGED
@@ -18,7 +18,20 @@ Mida keeps RubyGems[http://rubygems.org/gems/mida] up-to-date with its latest ve
18
18
 
19
19
  * +Nokogiri+
20
20
 
21
- == Usage
21
+ == Command Line Usage
22
+
23
+ To use the command line tool, supply it with the urls or filenames that you
24
+ would like to be parsed (by default each item is output as yaml):
25
+ mida http://lawrencewoodman.github.com/mida/news/
26
+
27
+ If you want to search for specific types you can use the <tt>-t</tt> switch
28
+ followed by a Regular Expression:
29
+ mida -t /person/i http://lawrencewoodman.github.com/mida/news/
30
+
31
+ For more information look at <tt>mida</tt>'s help:
32
+ mida -h
33
+
34
+ == Library Usage
22
35
  The following examples assume that you have required +mida+ and
23
36
  <tt>open-uri</tt>.
24
37
 
data/Rakefile CHANGED
@@ -1,24 +1,25 @@
1
1
  task :default => :spec
2
2
 
3
3
  desc "Create Gem"
4
- require 'rake/gempackagetask'
4
+ require 'rubygems/package_task'
5
5
  spec = Gem::Specification.new do |s|
6
6
  s.name = "mida"
7
7
  s.summary = "A Microdata parser/extractor library"
8
8
  s.description = "A Microdata parser and extractor library, based on the latest published version of the Microdata Specification, dated 5th April 2011."
9
- s.version = "0.3.0"
9
+ s.version = "0.3.1"
10
10
  s.author = "Lawrence Woodman"
11
11
  s.email = "lwoodman@vlifesystems.com"
12
12
  s.homepage = %q{http://lawrencewoodman.github.com/mida/}
13
13
  s.platform = Gem::Platform::RUBY
14
14
  s.required_ruby_version = '>=1.9'
15
15
  s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
16
+ s.executables = ['mida']
16
17
  s.extra_rdoc_files = ['README.rdoc', 'LICENSE.rdoc', 'CHANGELOG.rdoc']
17
18
  s.rdoc_options << '--main' << 'README.rdoc'
18
- s.add_dependency('nokogiri')
19
+ s.add_dependency('nokogiri', '>= 1.5')
19
20
  s.add_development_dependency('rspec', '>= 2.0' )
20
21
  end
21
- Rake::GemPackageTask.new(spec).define
22
+ Gem::PackageTask.new(spec).define
22
23
 
23
24
  desc "Run Specs"
24
25
  require 'rspec/core/rake_task'
data/bin/mida ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env ruby
2
+ ## Mida: Microdata parser/extractor
3
+ ##
4
+ ## Usage: mida [options...] [sources...]
5
+ ##
6
+ ## Find the Microdata in the given 'sources', which can be urls or files.
7
+ ## Urls must be prefixed with: http://
8
+ ##
9
+
10
+ require 'open-uri'
11
+ require 'yaml'
12
+ require 'optparse'
13
+
14
+ # Displays comment at top of file
15
+ def banner
16
+ File.readlines(__FILE__).
17
+ grep(/^##.*/).
18
+ map { |line| line.chomp[3..-1] }.
19
+ join("\n")+"\n"+" Options:\n"
20
+ end
21
+
22
+ begin
23
+ require 'mida'
24
+ rescue LoadError
25
+ raise if $!.to_s !~ /mida/
26
+ libdir = File.expand_path("../../lib", __FILE__).sub(/^#{Dir.pwd}/, '.')
27
+ if !$:.include?(libdir)
28
+ warn "warn: #{$!.to_s}. trying again with #{libdir} on load path."
29
+ $:.unshift libdir
30
+ retry
31
+ end
32
+ raise
33
+ end
34
+
35
+ options = {sourcename: true, validate: true}
36
+ ARGV.options do |option|
37
+ option.banner = banner
38
+ option.on('-c','--count', 'Display the counts of each Microdata Type') do
39
+ options[:count] = true
40
+ end
41
+ option.on('-n','--no-sourcename', "Don't display the source name") do
42
+ options[:sourcename] = false
43
+ end
44
+ option.on('-t','--type TYPE', Regexp,
45
+ 'A regexp to match the itemtypes against') do |type|
46
+ options[:type] = type
47
+ end
48
+ option.on('-v','--no-validate',
49
+ "Don't validate the items against known Vocabularies") do
50
+ options[:validate] = false
51
+ end
52
+ option.on_tail('-h','--help', 'This help message') {puts option; exit}
53
+
54
+ begin
55
+ option.parse!
56
+ rescue OptionParser::InvalidOption => error
57
+ puts "#{error}\n#{option}"; exit
58
+ end
59
+ if ARGV.empty? then puts option; exit end
60
+ end
61
+
62
+ # Get the url from the source if there is one
63
+ def get_url
64
+ ARGV.first =~ %r{^http://.*} ? ARGV.first : nil
65
+ end
66
+
67
+ # Display each item as yaml
68
+ def display_items(items)
69
+ items.each {|item| puts item.to_h.to_yaml}
70
+ end
71
+
72
+ # Returns a hash {type => count}
73
+ def count_types(types)
74
+ types.each_with_object(Hash.new(0)) {|type,count| count[type] += 1}
75
+ end
76
+
77
+ # Display the number of each type of item
78
+ def display_count(items)
79
+ types = items.collect {|item| item.type}
80
+ count_types(types).each {|type, count| puts "Found #{count} #{type}"}
81
+ end
82
+
83
+ def parse_source(source, options)
84
+ url = get_url
85
+ begin
86
+ open(source) do |f|
87
+ doc = Mida::Document.new(f, url, options[:validate])
88
+ type = options[:type] || %r{}
89
+ items = doc.search(type)
90
+ if items.empty?
91
+ puts "No microdata found in this document."; exit
92
+ else
93
+ if options.include?(:count)
94
+ display_count(items)
95
+ else
96
+ display_items(items)
97
+ end
98
+ end
99
+ end
100
+ rescue
101
+ puts "Failed to parse: #{source}"
102
+ exit
103
+ end
104
+ end
105
+
106
+ ARGV.each do |source|
107
+ puts "Parsing: #{source}" if options[:sourcename]
108
+ parse_source(source, options)
109
+ puts
110
+ end
data/lib/mida/document.rb CHANGED
@@ -15,10 +15,11 @@ module Mida
15
15
  # [target] The string containing the html that you want to parse.
16
16
  # [page_url] The url of target used to form absolute urls. This must
17
17
  # include the filename, e.g. index.html.
18
- def initialize(target, page_url=nil)
18
+ # [validate] Whether to validate the items against known vocabularies.
19
+ def initialize(target, page_url=nil, validate=true)
19
20
  @doc = Nokogiri(target)
20
21
  @page_url = page_url
21
- @items = extract_items
22
+ @items = extract_items(validate)
22
23
  end
23
24
 
24
25
  # Implements method for Enumerable
@@ -45,13 +46,13 @@ module Mida
45
46
  end
46
47
 
47
48
  private
48
- def extract_items
49
+ def extract_items(validate)
49
50
  itemscopes = @doc.search('//*[@itemscope and not(@itemprop)]')
50
51
  return nil unless itemscopes
51
52
 
52
53
  itemscopes.collect do |itemscope|
53
54
  itemscope = Itemscope.new(itemscope, @page_url)
54
- Item.new(itemscope)
55
+ Item.new(itemscope, validate)
55
56
  end
56
57
  end
57
58
 
data/lib/mida/item.rb CHANGED
@@ -24,12 +24,13 @@ module Mida
24
24
  # its +properties+
25
25
  #
26
26
  # [itemscope] The itemscope that has been parsed by +Itemscope+
27
- def initialize(itemscope)
27
+ # [validate] Whether to validate the item against known vocabularies
28
+ def initialize(itemscope, validate=true)
28
29
  @type = itemscope.type
29
30
  @id = itemscope.id
30
31
  @vocabulary = Mida::Vocabulary.find(@type)
31
32
  @properties = itemscope.properties
32
- validate_properties
33
+ validate_properties if validate
33
34
  end
34
35
 
35
36
  # Return a Hash representation
@@ -39,7 +40,12 @@ module Mida
39
40
  # properties: {'a name' => 'avalue' }
40
41
  # }
41
42
  def to_h
42
- {type: @type, id: @id, properties: properties_to_h(@properties)}
43
+ # Only fill hash with non-nil values
44
+ hash = {}
45
+ @type and hash[:type] = @type
46
+ @id and hash[:id] = @id
47
+ @properties.any? and hash[:properties] = properties_to_h(@properties)
48
+ hash
43
49
  end
44
50
 
45
51
  def to_s
@@ -58,7 +64,11 @@ module Mida
58
64
  @properties =
59
65
  @properties.each_with_object({}) do |(property, values), hash|
60
66
  valid_values = validate_values(property, values)
61
- hash[property] = valid_values unless valid_values.nil?
67
+ if valid_values.respond_to?(:any?)
68
+ hash[property] = valid_values if valid_values.any?
69
+ else
70
+ hash[property] = valid_values
71
+ end
62
72
  end
63
73
  end
64
74
 
@@ -77,9 +87,9 @@ module Mida
77
87
  # Return valid values, converted to the correct +DataType+
78
88
  # or +Item+ and number if necessary
79
89
  def validate_values(property, values)
80
- return nil unless valid_property?(property, values)
90
+ return [] unless valid_property?(property, values)
81
91
  prop_num = property_number(property)
82
- return nil unless valid_num_values?(prop_num, values)
92
+ return [] unless valid_num_values?(prop_num, values)
83
93
  prop_types = property_types(property)
84
94
 
85
95
  valid_values = values.each_with_object([]) do |value, valid_values|
@@ -30,16 +30,6 @@ def match_array(value_array, expected_results)
30
30
  end
31
31
  end
32
32
 
33
- shared_examples_for 'one root itemscope' do
34
- it 'should not match itemscopes with different names' do
35
- @md.search(%r{nothing}).size.should == 0
36
- end
37
-
38
- it 'should find the correct number of itemscopes' do
39
- @md.items.size.should == 1
40
- end
41
- end
42
-
43
33
  describe Mida::Document do
44
34
  before do
45
35
  html = '
@@ -78,6 +68,49 @@ describe Mida::Document do
78
68
  end
79
69
  end
80
70
 
71
+ describe Mida::Document, 'when initialized' do
72
+ before do
73
+ @html = '
74
+ <html><body>
75
+ <div itemscope itemtype="http://data-vocabulary.org/Review">
76
+ <span itemprop="itemreviewed">Romeo Pizza</span>
77
+ <span itemprop="itemreviewed">Some Other Pizza</span>
78
+ </div>
79
+ </body></html>
80
+ '
81
+
82
+ class Review < Mida::Vocabulary
83
+ itemtype %r{http://data-vocabulary.org/Review}
84
+ has_one 'item_reviewed'
85
+ end
86
+ end
87
+
88
+ context 'with validation on' do
89
+ before do
90
+ @md = Mida::Document.new(@html)
91
+ end
92
+
93
+ it 'should reject properties for items of known vocabularies that are not valid' do
94
+ @md.items[0].properties.should == {}
95
+ end
96
+ end
97
+
98
+ context 'with validation off' do
99
+ before do
100
+ @md = Mida::Document.new(@html, false)
101
+ end
102
+
103
+ it 'should accept properties for items of known vocabularies even if not valid' do
104
+ @md.items[0].properties['itemreviewed'].should ==
105
+ ['Romeo Pizza', 'Some Other Pizza']
106
+ end
107
+ end
108
+
109
+ after do
110
+ Mida::Vocabulary.unregister(Review)
111
+ end
112
+ end
113
+
81
114
  describe Mida::Document, 'when run against a full html document containing itemscopes with and without itemtypes' do
82
115
 
83
116
  before do
@@ -161,7 +194,6 @@ describe Mida::Document, 'when run against a full html document containing two n
161
194
  it 'should return all the properties and types with the correct values for 1st itemscope' do
162
195
  expected_results = [{
163
196
  type: 'http://data-vocabulary.org/Review',
164
- id: nil,
165
197
  properties: {
166
198
  'itemreviewed' => ['Romeo Pizza'],
167
199
  'rating' => ['4.5']
@@ -173,7 +205,6 @@ describe Mida::Document, 'when run against a full html document containing two n
173
205
  it 'should return all the properties from the text for 2nd itemscope' do
174
206
  expected_results = [{
175
207
  type: 'http://data-vocabulary.org/Organization',
176
- id: nil,
177
208
  properties: {
178
209
  'name' => ['An org name'],
179
210
  'url' => ['http://example.com']
@@ -209,7 +240,13 @@ describe Mida::Document, 'when run against a full html document containing one
209
240
  @md = Mida::Document.new(html)
210
241
  end
211
242
 
212
- it_should_behave_like 'one root itemscope'
243
+ it 'should not match itemscopes with different names' do
244
+ @md.search(%r{nothing}).size.should == 0
245
+ end
246
+
247
+ it 'should find the correct number of itemscopes' do
248
+ @md.items.size.should == 1
249
+ end
213
250
 
214
251
  it 'should return the correct number of itemscopes' do
215
252
  vocabularies = [
@@ -224,13 +261,11 @@ describe Mida::Document, 'when run against a full html document containing one
224
261
  it 'should return all the properties from the text with the correct values' do
225
262
  expected_results = [{
226
263
  type: 'http://data-vocabulary.org/Product',
227
- id: nil,
228
264
  properties: {
229
265
  'name' => ['DC07'],
230
266
  'brand' => ['Dyson'],
231
267
  'review' => [{
232
268
  type: 'http://data-vocabulary.org/Review-aggregate',
233
- id: nil,
234
269
  properties: {
235
270
  'count' => ['1'],
236
271
  'rating' => ['5.0']
@@ -287,14 +322,12 @@ describe Mida::Document, 'when run against a document containing an itemscope
287
322
  pending("get the contains: feature working")
288
323
  expected_result = {
289
324
  type: 'http://data-vocabulary.org/Product',
290
- id: nil,
291
325
  properties: {
292
326
  'name' => 'DC07',
293
327
  'brand' => 'Dyson'
294
328
  },
295
329
  contains: {
296
330
  type: 'http://data-vocabulary.org/Review-aggregate',
297
- id: nil,
298
331
  properties: {
299
332
  'count' => '1',
300
333
  'rating' => '5.0'
data/spec/item_spec.rb CHANGED
@@ -27,12 +27,8 @@ describe Mida::Item, 'when initialized with an incomplete itemscope' do
27
27
  @item.properties.should == {}
28
28
  end
29
29
 
30
- it '#to_h should return the correct type and properties' do
31
- @item.to_h.should == {
32
- type: nil,
33
- id: nil,
34
- properties: {}
35
- }
30
+ it '#to_h should return an empty hash' do
31
+ @item.to_h.should == {}
36
32
  end
37
33
  end
38
34
 
@@ -66,7 +62,7 @@ describe Mida::Item, 'when initialized with a complete itemscope of an unknown t
66
62
  }
67
63
  end
68
64
 
69
- it '#to_h should return the correct type and properties' do
65
+ it '#to_h should return the correct type, id and properties' do
70
66
  @item.to_h.should == {
71
67
  type: 'book',
72
68
  id: "urn:isbn:978-1-849510-50-9",
@@ -113,7 +109,7 @@ describe Mida::Item, 'when initialized with an itemscope of a known type' do
113
109
  @item.properties['url'].should == ['http://example.com/user/lorry']
114
110
  end
115
111
 
116
- it 'should reject datatypes that are not valid' do
112
+ it 'should accept datatypes that are valid' do
117
113
  @item.properties['date'][0].should == '2nd October 2009'
118
114
  end
119
115
 
@@ -121,10 +117,6 @@ describe Mida::Item, 'when initialized with an itemscope of a known type' do
121
117
  @item.properties['date'][1].should == Date.iso8601('2009-10-02')
122
118
  end
123
119
 
124
- it 'should reject datatypes that are not valid' do
125
- @item.properties['date'][1].should == Date.iso8601('2009-10-02')
126
- end
127
-
128
120
  it '#properties should return the same properties as the itemscope' do
129
121
  @item.properties.should == {
130
122
  'name' => 'Lorry Woodman',
@@ -136,7 +128,6 @@ describe Mida::Item, 'when initialized with an itemscope of a known type' do
136
128
  it '#to_h should return the correct type and properties' do
137
129
  @item.to_h.should == {
138
130
  type: 'http://example.com/vocab/person',
139
- id: nil,
140
131
  properties: {
141
132
  'name' => 'Lorry Woodman',
142
133
  'date' => ['2nd October 2009', Date.iso8601('2009-10-02')],
@@ -157,31 +148,58 @@ describe Mida::Item, 'when initialized with an itemscope of a known type that do
157
148
  itemtype %r{http://example.com/vocab/person}
158
149
  has_one 'name', 'tel'
159
150
  has_many 'url', 'city'
151
+ has_one 'dob' do
152
+ extract Mida::DataType::ISO8601Date
153
+ end
160
154
  end
161
155
 
162
- itemscope = mock(Mida::Itemscope)
163
- itemscope.stub!(:type).and_return("http://example.com/vocab/person")
164
- itemscope.stub!(:id).and_return(nil)
165
- itemscope.stub!(:properties).and_return(
156
+ @itemscope = mock(Mida::Itemscope)
157
+ @itemscope.stub!(:type).and_return("http://example.com/vocab/person")
158
+ @itemscope.stub!(:id).and_return(nil)
159
+ @itemscope.stub!(:properties).and_return(
166
160
  { 'name' => ['Lorry Woodman'],
167
161
  'tel' => ['000004847582', '111111857485'],
168
162
  'url' => ['http://example.com/user/lorry'],
169
- 'city' => ['Bristol']
163
+ 'city' => ['Bristol'],
164
+ 'dob' => 'When I was born'
170
165
  }
171
166
  )
172
- @item = Mida::Item.new(itemscope)
173
167
  end
174
168
 
175
- it '#vocabulary should return the correct vocabulary' do
176
- @item.vocabulary.should == Person
177
- end
169
+ context 'when validation selected' do
170
+ before do
171
+ @item = Mida::Item.new(@itemscope)
172
+ end
173
+
174
+ it '#vocabulary should return the correct vocabulary' do
175
+ @item.vocabulary.should == Person
176
+ end
177
+
178
+ it 'should not keep properties that have too many values' do
179
+ @item.properties.should_not have_key('tel')
180
+ end
178
181
 
179
- it 'should not keep properties that have too many values' do
180
- @item.properties.should_not have_key('tel')
182
+ it 'should not keep properties that have the wrong DataType' do
183
+ @item.properties.should_not have_key('dob')
184
+ end
181
185
  end
182
186
 
183
- it 'should keep have_many properties even if they have only one value' do
184
- @item.properties.should have_key('city')
187
+ context 'when validation not selected' do
188
+ before do
189
+ @item = Mida::Item.new(@itemscope, false)
190
+ end
191
+
192
+ it '#vocabulary should return the correct vocabulary' do
193
+ @item.vocabulary.should == Person
194
+ end
195
+
196
+ it 'should keep properties even if they have too many values' do
197
+ @item.properties.should have_key('tel')
198
+ end
199
+
200
+ it 'should keep properties even if they have the wrong DataType' do
201
+ @item.properties.should have_key('dob')
202
+ end
185
203
  end
186
204
 
187
205
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: mida
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.3.0
5
+ version: 0.3.1
6
6
  platform: ruby
7
7
  authors:
8
8
  - Lawrence Woodman
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-06-29 00:00:00 Z
13
+ date: 2011-07-05 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: nokogiri
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - ">="
22
22
  - !ruby/object:Gem::Version
23
- version: "0"
23
+ version: "1.5"
24
24
  type: :runtime
25
25
  version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
@@ -36,8 +36,8 @@ dependencies:
36
36
  version_requirements: *id002
37
37
  description: A Microdata parser and extractor library, based on the latest published version of the Microdata Specification, dated 5th April 2011.
38
38
  email: lwoodman@vlifesystems.com
39
- executables: []
40
-
39
+ executables:
40
+ - mida
41
41
  extensions: []
42
42
 
43
43
  extra_rdoc_files:
@@ -78,6 +78,7 @@ files:
78
78
  - README.rdoc
79
79
  - LICENSE.rdoc
80
80
  - Rakefile
81
+ - bin/mida
81
82
  homepage: http://lawrencewoodman.github.com/mida/
82
83
  licenses: []
83
84