prism 0.1.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of prism might be problematic. Click here for more details.

Files changed (71) hide show
  1. data/.gitignore +3 -0
  2. data/Gemfile +11 -0
  3. data/Gemfile.lock +51 -0
  4. data/LICENSE +20 -0
  5. data/README.md +137 -0
  6. data/Rakefile +53 -0
  7. data/bin/prism +99 -0
  8. data/lib/prism.rb +130 -0
  9. data/lib/prism/microformat.rb +49 -0
  10. data/lib/prism/microformat/adr.rb +22 -0
  11. data/lib/prism/microformat/geo.rb +48 -0
  12. data/lib/prism/microformat/hcard.rb +182 -0
  13. data/lib/prism/microformat/rellicense.rb +20 -0
  14. data/lib/prism/microformat/reltag.rb +38 -0
  15. data/lib/prism/microformat/votelinks.rb +42 -0
  16. data/lib/prism/microformat/xfn.rb +54 -0
  17. data/lib/prism/microformat/xmdp.rb +14 -0
  18. data/lib/prism/microformat/xoxo.rb +69 -0
  19. data/lib/prism/pattern.rb +26 -0
  20. data/lib/prism/pattern/abbr.rb +21 -0
  21. data/lib/prism/pattern/datetime.rb +75 -0
  22. data/lib/prism/pattern/typevalue.rb +32 -0
  23. data/lib/prism/pattern/url.rb +32 -0
  24. data/lib/prism/pattern/valueclass.rb +51 -0
  25. data/lib/prism/posh.rb +3 -0
  26. data/lib/prism/posh/anchor.rb +40 -0
  27. data/lib/prism/posh/base.rb +204 -0
  28. data/lib/prism/posh/definition_list.rb +41 -0
  29. data/prism.gemspec +132 -0
  30. data/test/fixtures/hcard/commercenet.html +21 -0
  31. data/test/fixtures/hcard/geo.html +28 -0
  32. data/test/fixtures/huffduffer.html +466 -0
  33. data/test/fixtures/likeorhate.html +48 -0
  34. data/test/fixtures/rel_license.html +4 -0
  35. data/test/fixtures/test-fixture/hcard/hcard1.html +147 -0
  36. data/test/fixtures/test-fixture/hcard/hcard11.html +123 -0
  37. data/test/fixtures/test-fixture/hcard/hcard12.html +178 -0
  38. data/test/fixtures/test-fixture/hcard/hcard17.html +165 -0
  39. data/test/fixtures/test-fixture/hcard/hcard2.html +264 -0
  40. data/test/fixtures/test-fixture/hcard/hcard3.html +144 -0
  41. data/test/fixtures/test-fixture/hcard/hcard4.html +117 -0
  42. data/test/fixtures/test-fixture/hcard/hcard5.html +119 -0
  43. data/test/fixtures/test-fixture/hcard/hcard6.html +188 -0
  44. data/test/fixtures/test-fixture/hcard/hcard7.html +188 -0
  45. data/test/fixtures/test-fixture/hcard/hcard8.html +130 -0
  46. data/test/fixtures/test-fixture/hcard/hcard9.html +111 -0
  47. data/test/fixtures/test-fixture/hcard/hcard99.html +215 -0
  48. data/test/fixtures/test-fixture/value-class-date-time/value-dt-test-YYYY-MM-DD--HH-MM.html +9 -0
  49. data/test/fixtures/test-fixture/value-class-date-time/value-dt-test-abbr-YYYY-MM-DD--HH-MM.html +4 -0
  50. data/test/fixtures/xfn.html +198 -0
  51. data/test/fixtures/xmdp.html +32 -0
  52. data/test/fixtures/xoxo.html +51 -0
  53. data/test/microformat/adr_test.rb +47 -0
  54. data/test/microformat/geo_test.rb +66 -0
  55. data/test/microformat/hcard_test.rb +510 -0
  56. data/test/microformat/rellicense_test.rb +36 -0
  57. data/test/microformat/reltag_test.rb +61 -0
  58. data/test/microformat/votelinks_test.rb +44 -0
  59. data/test/microformat/xfn_test.rb +28 -0
  60. data/test/microformat/xmdp_test.rb +16 -0
  61. data/test/microformat/xoxo_test.rb +51 -0
  62. data/test/microformat_test.rb +20 -0
  63. data/test/pattern/date_time_test.rb +55 -0
  64. data/test/pattern/value_class_test.rb +33 -0
  65. data/test/pattern_test.rb +132 -0
  66. data/test/posh/anchor_test.rb +41 -0
  67. data/test/posh/base_test.rb +150 -0
  68. data/test/posh/definition_list_test.rb +38 -0
  69. data/test/prism_test.rb +133 -0
  70. data/test/test_helper.rb +32 -0
  71. metadata +161 -0
@@ -0,0 +1,3 @@
1
+ pkg/
2
+ doc/
3
+ .bundle/
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
# Gems are resolved from the Gemcutter source.
source :gemcutter

# Dependency in the default group.
gem 'nokogiri'

# Dependencies declared in the :test group.
group :test do
  %w[rake contest redgreen fakeweb jeweler].each { |gem_name| gem gem_name }
end
@@ -0,0 +1,51 @@
1
+ ---
2
+ dependencies:
3
+ fakeweb:
4
+ group:
5
+ - :test
6
+ version: ">= 0"
7
+ rake:
8
+ group:
9
+ - :test
10
+ version: ">= 0"
11
+ contest:
12
+ group:
13
+ - :test
14
+ version: ">= 0"
15
+ jeweler:
16
+ group:
17
+ - :test
18
+ version: ">= 0"
19
+ redgreen:
20
+ group:
21
+ - :test
22
+ version: ">= 0"
23
+ nokogiri:
24
+ group:
25
+ - :default
26
+ version: ">= 0"
27
+ specs:
28
+ - nokogiri:
29
+ version: 1.4.1
30
+ - json_pure:
31
+ version: 1.2.0
32
+ - git:
33
+ version: 1.2.5
34
+ - redgreen:
35
+ version: 1.2.2
36
+ - rubyforge:
37
+ version: 2.0.3
38
+ - rake:
39
+ version: 0.8.7
40
+ - gemcutter:
41
+ version: 0.3.0
42
+ - jeweler:
43
+ version: 1.4.0
44
+ - contest:
45
+ version: 0.1.2
46
+ - fakeweb:
47
+ version: 1.2.8
48
+ hash: a411a98d29121a4b1d05b9fbe457c6e068325a09
49
+ sources:
50
+ - Rubygems:
51
+ uri: http://gemcutter.org
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Mark Wunsch
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,137 @@
1
+ # Prism
2
+
3
+ **Ruby microformat parser and HTML toolkit**
4
+
5
+ _Formerly known as hMachine_
6
+
7
+ [RDoc](http://rdoc.info/projects/mwunsch/prism) | [Gem](http://rubygems.org/gems/prism) | [Metrics](http://getcaliper.com/caliper/project?repo=git%3A%2F%2Fgithub.com%2Fmwunsch%2Fprism.git)
8
+
9
+ ## What Prism is:
10
+
11
+ + A robust microformat parser
12
+ + A command-line tool for parsing microformats from a url or a string of markup
13
+ + A DSL for defining semantic markup patterns
14
+ + Export microformats to other standards:
15
+ + hCard => vCard
16
+
17
+ It is your [lowercase semantic web](http://tantek.com/presentations/2004etech/realworldsemanticspres.html) friend.
18
+
19
+ >Designed for humans first and machines second, microformats are a set of simple, open data formats built upon existing and widely adopted standards. Instead of throwing away what works today, microformats intend to solve simpler problems first by adapting to current behaviors and usage patterns (e.g. XHTML, blogging).
20
+
21
+ Learn more about Microformats at http://microformats.org.
22
+
23
+ ## Usage
24
+
25
+ The command-line tool reads SOURCE from standard input or takes it as an argument:
26
+
27
+ $: curl http://markwunsch.com | prism --hcard > ~/Desktop/me.vcf
28
+
29
+ OR
30
+
31
+ $: prism --hcard http://markwunsch.com > ~/Desktop/me.vcf
32
+
33
+ ## Installation
34
+
35
+ With Ruby and Rubygems:
36
+
37
+ gem install prism
38
+
39
+ Or clone the repository and run `bundle install` to get the development dependencies.
40
+
41
+ #### Requirements:
42
+
43
+ + [Nokogiri](http://github.com/tenderlove/nokogiri)
44
+
45
+ ## Microformats supported (right now, as of this very moment)
46
+
47
+ + [rel-tag](http://microformats.org/wiki/rel-tag)
48
+ + [rel-license](http://microformats.org/wiki/rel-license)
49
+ + [VoteLinks](http://microformats.org/wiki/vote-links)
50
+ + [XFN](http://microformats.org/wiki/XFN)
51
+ + [XOXO](http://microformats.org/wiki/xoxo)
52
+ + [XMDP](http://microformats.org/wiki/XMDP)
53
+ + [geo](http://microformats.org/wiki/geo)
54
+ + [adr](http://microformats.org/wiki/adr)
55
+ + [hCard](http://microformats.org/wiki/hcard)
56
+
57
+ More on the way.
58
+
59
+ ## Finding Microformats:
60
+
61
+ # All microformats
62
+ Prism.find 'http://foobar.com'
63
+
64
+ # A specific microformat
65
+ Prism.find 'http://foobar.com', :hcard
66
+
67
+ # Search HTML too
68
+ Prism.find big_string_of_html
69
+
70
+ ### Parsing Microformats:
71
+
72
+ twitter_contacts = Prism.find 'http://twitter.com/markwunsch', :hcard
73
+ me = twitter_contacts.first
74
+ me.fn
75
+ #=> "Mark Wunsch"
76
+ me.n.family_name
77
+ #=> "Wunsch"
78
+ me.url
79
+ #=> "http://markwunsch.com/"
80
+ File.open('mark.vcf','w') {|f| f.write me.to_vcard }
81
+ ## Add me to your address book!
82
+
83
+ ## POSH DSL
84
+
85
+ The `Prism` module defines a group of methods to search, validate, and extract nodes out of a Nokogiri document.
86
+
87
+ All microformats inherit from `Prism::POSH::Base`, because all microformats begin as [POSH formats](http://microformats.org/wiki/posh). If you wanted to create your own POSH format, you'd do something like this:
88
+
89
+ class Navigation < Prism::POSH::Base
90
+ search {|document| document.css('ul#navigation') }
91
+ # Search a Nokogiri document for nodes of a certain type
92
+
93
+ validate {|node| node.matches?('ul#navigation') }
94
+ # Validate that a node is the right element we want
95
+
96
+ has_many :items do
97
+ search {|doc| doc.css('li') }
98
+ end
99
+ # has_many and has_one define Properties of the POSH format (Prism::Property)
100
+ # Each Property object includes the Prism module, so they can search, validate, and extract
101
+ end
102
+
103
+ Now you can do:
104
+
105
+ nav = Navigation.parse_first(document)
106
+ # document is a Nokogiri document.
107
+ # parse_first extracts just the first example of the format out of the document
108
+
109
+ nav.items
110
+ # Returns an array of contents
111
+ # This method comes from the has_many call up above that defines the Property
112
+
113
+ ## Other Microformat parsers
114
+
115
+ + [Mofo](http://mofo.rubyforge.org/) is a Ruby microformat parser backed by Hpricot.
116
+ + [Sumo](http://www.danwebb.net/2007/2/9/sumo-a-generic-microformats-parser-for-javascript) is a JavaScript microformat parser.
117
+ + [Operator](https://addons.mozilla.org/en-US/firefox/addon/4106) is a Firefox extension.
118
+ + [hKit](http://code.google.com/p/hkit/) is a microformat parser for PHP.
119
+ + [Oomph](http://visitmix.com/labs/oomph/) is a microformat toolkit add-in for Internet Explorer.
120
+
121
+ ## Feature wishlist:
122
+
123
+ + HTML outliner (using HTML5 sectioning)
124
+ + Extensions so you can do something like: `String.is_a_valid? :hcard` in your tests
125
+ + Extensions to turn Ruby objects into semantic HTML. Hash.to_definition_list, Array.to_ordered_list, etc.
126
+
127
+ ## TODO:
128
+
129
+ + Handle nested microformats better (I like Prism::Pattern::ValueClass's search implementation the best)
130
+ + Code is ugly. Especially XOXO.
131
+ + Better recursive parsing of trees. See above.
132
+ + Tests are all kinds of disorganized.
133
+ + Broader support for some of the weirder Patterns, like object[data]
134
+
135
+ ## License
136
+
137
+ Prism is licensed under the [MIT License](http://creativecommons.org/licenses/MIT/) and is Copyright (c) 2010 Mark Wunsch.
@@ -0,0 +1,53 @@
1
# Load the gem environment. The locked environment file under .bundle/ is
# tried first; if it cannot be loaded, Bundler sets things up at runtime.
begin
  require File.expand_path('../.bundle/environment', __FILE__)
rescue LoadError
  require "rubygems"
  require "bundler"
  Bundler.setup
end

# Put this checkout's lib/ directory at the front of the load path.
lib_dir = File.join(File.dirname(__FILE__), 'lib')
$LOAD_PATH.unshift(lib_dir)
require 'prism'
require 'rake'

# A bare `rake` runs the test task.
task :default => :test

require 'rake/testtask'
Rake::TestTask.new do |test_task|
  test_task.libs << "test"
  test_task.pattern = 'test/**/*_test.rb'
  test_task.verbose = false
end

# Packaging tasks are provided by Jeweler. When Jeweler cannot be loaded,
# only an installation hint is printed and the remaining tasks still load.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |spec|
    spec.name = "prism"
    spec.summary = "Ruby microformat parser and HTML toolkit"
    spec.description = "A Ruby microformat parser and HTML toolkit powered by Nokogiri"
    spec.version = Prism::VERSION
    spec.homepage = "http://github.com/mwunsch/prism"
    spec.authors = ["Mark Wunsch"]
    spec.email = ["mark@markwunsch.com"]
    spec.add_dependency 'nokogiri'
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end

# RDoc output goes to doc/, with the README as the main page.
require 'rake/rdoctask'
Rake::RDocTask.new do |doc_task|
  doc_task.rdoc_dir = 'doc'
  doc_task.title = 'Prism'
  doc_task.main = 'README.md'
  doc_task.rdoc_files.include('README.*', 'lib/**/*.rb', 'LICENSE')
  doc_task.options << '--inline-source'
end

desc "Open an irb session preloaded with this library"
task(:console) { sh "irb -rubygems -I lib -r prism" }
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env ruby
2
+ ## Prism: Microformat parser and HTML toolkit.
3
+ ##
4
+ ## Usage: prism [ OPTION ] [ SOURCE ]
5
+ ##
6
+ ## Find the Microformats in the given SOURCE. SOURCE can be a URL
7
+ ## or a string of HTML. If no Microformat is specified in UFORMAT,
8
+ ## prism will just print a list of what has been found.
9
+ ##
10
+ ## If a Microformat is specified, Prism will convert it into a
11
+ ## more suitable format. eg. hCard becomes a vCard.
12
+ ##
13
+ ## If no SOURCE is given, prism will read from the Standard Input.
14
+ ##
15
+ ## Microformats:
16
+ ## --vcard, --hcard hCard => vCard converter
17
+ ## --xfn Get XFN URLs
18
+ ##
19
+ ## Other Options:
20
+ ## -h, --help show this help message
21
+ ## -v, --version version of Prism
22
+ ##
23
+ ## Learn more about Microformats at http://microformats.org
24
+ ##
25
+ ##
26
+
27
+ require 'optparse'
28
+
29
# Build the help text from this script's own source: every line that starts
# with two hash marks is kept, its first three characters are dropped, and
# the remainders are joined with newlines.
def usage
  help_lines = File.readlines(__FILE__).select { |line| line =~ /^##.*/ }
  help_lines.map { |line| line.chomp[3..-1] }.join("\n")
end
35
+
36
# Load prism from the load path if possible; otherwise add the lib/ directory
# that sits next to this script's directory and try again.
begin
  require 'prism'
rescue LoadError
  sibling_lib = File.join(File.dirname(__FILE__), '..', 'lib')
  $LOAD_PATH.unshift sibling_lib
  require 'prism'
end

# Flags selected on the command line. Stays empty when no microformat
# option was passed.
options = {}
ARGV.options do |parser|
  parser.banner = "Hi."
  parser.on('--hcard','--vcard') { options[:vcard] = true }
  parser.on('--xfn') { options[:xfn] = true }
  parser.on_tail('-h','--help') do
    puts usage
    exit
  end
  parser.on_tail('-v','--version') do
    puts Prism::VERSION
    exit
  end
  parser.parse!
end

# No SOURCE argument and standard input is a terminal: show the help text and stop.
if ARGV.empty? && STDIN.tty?
  puts usage
  exit
end
57
+
58
# Build one summary sentence per microformat type present in +group+.
#
# group - the value returned by Prism.find. Usually a collection of parsed
#         microformats; nil and a single microformat object are accepted as
#         well, matching how parse_microformats treats the same result.
#
# Returns an Array of Strings, one for each registered microformat class that
# has at least one instance in +group+. When there is none, a notice is
# printed and the process exits.
def uformat_counts(group)
  # Normalise to something that supports #select so that a nil or lone
  # result does not raise NoMethodError below.
  members = if group.nil?
    []
  elsif group.respond_to?(:select)
    group
  else
    [group]
  end

  summaries = Prism::Microformat.microformats.values.collect do |uformat|
    found = members.select { |format| format.is_a?(uformat) }.length
    next if found.zero?
    name = "#{uformat::FRIENDLY_NAME}"
    name += "s" if found > 1 # naive pluralisation of the friendly name
    "Found #{found} #{name} in the document. Read more at: #{uformat::WIKI_URL}"
  end.compact

  if summaries.empty?
    puts "No microformats found in this document."
    exit
  end
  summaries
end
73
+
74
+ def parse_microformats(doc, type)
75
+ uformats = Prism.find(doc, type)
76
+ if uformats
77
+ if uformats.respond_to?(:length)
78
+ uformats.each {|uf| yield uf if block_given? }
79
+ else
80
+ yield uformats if block_given?
81
+ end
82
+ else
83
+ puts "No #{type}s found in this document."
84
+ end
85
+ end
86
+
87
# The SOURCE to parse: the first remaining command-line argument when there
# is one, otherwise everything read from standard input.
def input
  ARGV.first || STDIN.read
end
90
+
91
# Dispatch on the parsed flags: with none, list what was found; with --hcard
# or --vcard print each hCard as a vCard; with --xfn print each XFN url.
if options.empty?
  uformat_counts(Prism.find(input)).each { |summary| puts summary }
elsif options[:vcard]
  parse_microformats(input, :hcard) { |hcard| puts hcard.to_vcard }
elsif options[:xfn]
  parse_microformats(input, :xfn) { |xfn| puts xfn.url }
end
@@ -0,0 +1,130 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
# Prism is both the library namespace and a mixin.
#
# The module-level functions (Prism.find, Prism.get, ...) turn a URL or a
# string of markup into a Nokogiri document and look up parsers by key.
# The instance methods give any object that includes or extends Prism a small
# DSL made of three user-definable functions: search, validate and extract.
module Prism
  VERSION = "0.1.0"
  PRODID = "-//markwunsch.com//Prism #{VERSION}//EN"

  # Convenience method for Prism::Microformat.find method
  def self.find(document, format=nil)
    Prism::Microformat.find(document, format)
  end

  # Get a string of html or a url and convert it to a Nokogiri Document.
  # A Nokogiri node is returned untouched. A string that parses as an
  # HTTP URI is fetched; anything else (including a string that is not a
  # valid URI at all) is parsed as markup.
  def self.get(html)
    return html if html.is_a?(Nokogiri::XML::Node)
    begin
      url = URI.parse(html)
      url.is_a?(URI::HTTP) ? get_url(url.normalize.to_s) : get_document(html)
    rescue URI::InvalidURIError
      get_document(html)
    end
  end

  # Open a URL and convert the contents to a Nokogiri Document
  def self.get_url(url)
    # Read the whole response in one call rather than appending line by line.
    body = URI.parse(url).open { |web| web.read }
    get_document(body, url)
  end

  # Convert HTML to a Nokogiri Document. Nokogiri nodes pass through as-is;
  # +url+ is handed to Nokogiri::HTML.parse as its second argument.
  def self.get_document(html, url=nil)
    html.is_a?(Nokogiri::XML::Node) ? html : Nokogiri::HTML.parse(html, url)
  end

  # Turn a name into a lookup key: stringified, stripped, downcased, symbolized.
  def self.normalize(name)
    name.to_s.strip.downcase.intern
  end

  # Map a key to an element or design pattern.
  # Raises a RuntimeError for a key that is not recognized.
  def self.map(key)
    case normalize(key)
    when :value_class, :valueclass, :abbr, :uri, :url, :typevalue
      Prism::Pattern.map(key)
    when :hcard, :geo, :rellicense, :reltag, :votelinks, :xfn, :xmdp, :xoxo, :adr
      Prism::Microformat.map(key)
    when :base
      Prism::POSH::Base
    else
      raise "#{key} is not a recognized parser."
    end
  end

  # Get/Set a function that defines how to find an element in a node.
  # The Search function should return a Nokogiri::XML::NodeSet.
  # eg. <tt>search {|node| node.css(element) }</tt>
  # Without a stored function the default returns the node itself.
  def search(&block)
    @search = block if block_given?
    @search || lambda {|node| node }
  end

  # Search for the element in a document
  def find_in(document)
    search.call(document)
  end

  # Is the element found in node?
  # True when the search returns the node itself or a non-empty result.
  # The search runs once; a nil/false result counts as "not found", and a
  # single result object without #empty? counts as "found".
  def found_in?(node)
    found = find_in(node)
    return true if found.eql?(node)
    return !found.empty? if found.respond_to?(:empty?)
    !!found
  end

  # Get/Set a function that tests to make sure a given node is
  # the element we want. Should return truthy.
  # The default runs the search on the node's parent and checks that the
  # node is among the children of the result.
  def validate(&block)
    @validate = block if block_given?
    @validate || lambda { |node| find_in(node.parent).children.include?(node) }
  end

  # Is this a valid node?
  def valid?(node)
    validate.call(node)
  end

  # Define the pattern used to extract contents from node.
  # Can be a symbol that maps to an Element parser (see Prism.map), or a block.
  # A block takes precedence over +pattern+. Without a stored function the
  # default returns the node's stripped content.
  def extract(pattern = nil, &block)
    if block_given?
      @extract = block
    elsif pattern
      @extract = Prism.map(pattern).extract
    end
    @extract || lambda{|node| node.content.strip }
  end

  # Extract the content from the node
  def extract_from(node)
    extract.call(node)
  end

  # Parse the document, finding every instance of the desired element, and extract their contents.
  # Returns nil when nothing is found, the lone value when exactly one
  # element was extracted, and the full collection otherwise.
  def parse(document)
    return unless found_in?(document)
    found = find_in(document)
    contents = if found.respond_to?(:collect)
      found.collect { |element| extract_from(element) }
    else
      extract_from(document)
    end
    # Only unwrap real one-element collections. A one-character String also
    # has length 1 but no #first, and must be returned unchanged.
    single = contents.respond_to?(:first) && contents.respond_to?(:length) && (contents.length == 1)
    single ? contents.first : contents
  end

  # Parse the document, extracting the content for the first instance of the element.
  # Returns nil when nothing is found.
  def parse_first(document)
    return unless found_in?(document)
    elements = find_in(document)
    extract_from(elements.respond_to?(:first) ? elements.first : elements)
  end

end
127
+
128
+ require 'prism/pattern'
129
+ require 'prism/posh'
130
+ require 'prism/microformat'