craft 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +26 -18
- data/Rakefile +8 -1
- data/craft.gemspec +10 -7
- data/lib/craft.rb +89 -3
- data/lib/craft/version.rb +2 -2
- data/spec/craft_spec.rb +58 -0
- metadata +31 -8
data/README.md
CHANGED
@@ -1,29 +1,37 @@
|
|
1
1
|
# Craft
|
2
2
|
|
3
|
-
|
3
|
+
Craft XML and HTML into objects.
|
4
4
|
|
5
|
-
##
|
5
|
+
## Examples
|
6
|
+
```ruby
|
7
|
+
require 'craft'
|
8
|
+
require 'open-uri'
|
6
9
|
|
7
|
-
|
10
|
+
class Page < Craft
|
11
|
+
# Use CSS selectors
|
12
|
+
one :title, 'title'
|
8
13
|
|
9
|
-
|
14
|
+
# Use XPath
|
15
|
+
many :links, 'a/@href'
|
10
16
|
|
11
|
-
|
17
|
+
# Perform transforms on returned nodes
|
18
|
+
many :images, 'img', lambda { |img| img.attr('src').upcase }
|
19
|
+
end
|
12
20
|
|
13
|
-
|
21
|
+
page = Page.parse open('http://www.google.com')
|
14
22
|
|
15
|
-
|
23
|
+
page.title #=> 'Google'
|
24
|
+
page.links #=> ['http://www.google.com/imghp?hl=en&tab=wi', ...]
|
25
|
+
page.images #=> ['/LOGOS/2012/MOBY_DICK12-HP.JPG']
|
16
26
|
|
17
|
-
|
27
|
+
class Script < Craft
|
28
|
+
one :body, 'text()'
|
29
|
+
end
|
18
30
|
|
19
|
-
|
31
|
+
class Page < Craft
|
32
|
+
many :scripts, 'script', Script
|
33
|
+
end
|
20
34
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
1. Fork it
|
26
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
-
5. Create new Pull Request
|
35
|
+
page = Page.parse open('http://www.google.com')
|
36
|
+
page.scripts[0].body #=> 'window.google=...'
|
37
|
+
```
|
data/Rakefile
CHANGED
data/craft.gemspec
CHANGED
@@ -4,16 +4,19 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'craft/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
|
-
gem.name =
|
7
|
+
gem.name = 'craft'
|
8
8
|
gem.version = Craft::VERSION
|
9
|
-
gem.authors = [
|
10
|
-
gem.email = [
|
11
|
-
gem.description = %q{
|
12
|
-
gem.summary = %q{
|
13
|
-
|
9
|
+
gem.authors = ['Ezekiel Templin', 'Hakan Ensari']
|
10
|
+
gem.email = ['code@papercavalier.com']
|
11
|
+
gem.description = %q{Craft XML into objects}
|
12
|
+
gem.summary = %q{Craft is a data extraction tool that crafts objects
|
13
|
+
out of HTML and XML.}
|
14
|
+
gem.homepage = 'http://papercavalier.com/craft/'
|
14
15
|
|
15
16
|
gem.files = `git ls-files`.split($/)
|
16
17
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
18
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
-
gem.require_paths = [
|
19
|
+
gem.require_paths = ['lib']
|
20
|
+
|
21
|
+
gem.add_dependency 'nokogiri', '~> 1.5'
|
19
22
|
end
|
data/lib/craft.rb
CHANGED
@@ -1,5 +1,91 @@
|
|
1
|
-
require
|
1
|
+
require 'craft/version'
|
2
|
+
require 'nokogiri'
|
2
3
|
|
3
|
-
|
4
|
-
|
4
|
+
# Craft objects out of HTML and XML.
|
5
|
+
#
|
6
|
+
# Examples
|
7
|
+
#
|
8
|
+
# module Transformations
|
9
|
+
# IntegerTransform = lambda { |n| Integer n.text }
|
10
|
+
# end
|
11
|
+
#
|
12
|
+
# class Person < Craft
|
13
|
+
# include Transformations
|
14
|
+
#
|
15
|
+
# one :name, 'div.name'
|
16
|
+
# one :age, 'div.age', IntegerTransform
|
17
|
+
# many :friends, 'li.friend', Person
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
class Craft
|
21
|
+
class << self
|
22
|
+
# We alias call to new so that crafted objects may nest themselves or other
|
23
|
+
# crafted objects as transformations.
|
24
|
+
alias call new
|
25
|
+
|
26
|
+
# Define a method that extracts a collection of values from a parsed
|
27
|
+
# document.
|
28
|
+
#
|
29
|
+
# name - The Symbol name of the method.
|
30
|
+
# paths - One or more String XPath or CSS queries. An optional Proc
|
31
|
+
# transformation on the extracted value may be appended. If none is
|
32
|
+
# appended, the default transformation returns the stripped String
|
33
|
+
# value of the node.
|
34
|
+
#
|
35
|
+
# Returns an Array.
|
36
|
+
def many(name, *paths)
|
37
|
+
transform = pop_transformation paths
|
38
|
+
|
39
|
+
define_method name do
|
40
|
+
@node.search(*paths).map { |node| transform.call node }
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Define a method that extracts a single value from a parsed document.
|
45
|
+
#
|
46
|
+
# name - The Symbol name of the method.
|
47
|
+
# paths - One or more String XPath or CSS queries. An optional Proc
|
48
|
+
# transformation on the extracted value may be appended. If none is
|
49
|
+
# appended, the default transformation returns the stripped String
|
50
|
+
# value of the node.
|
51
|
+
#
|
52
|
+
# Returns an Object.
|
53
|
+
def one(name, *paths)
|
54
|
+
transform = pop_transformation paths
|
55
|
+
|
56
|
+
define_method name do
|
57
|
+
transform.call @node.at(*paths)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
# Parse a document.
|
62
|
+
#
|
63
|
+
# body - A String HTML or XML document.
|
64
|
+
#
|
65
|
+
# Returns an instance of itself.
|
66
|
+
def parse(body)
|
67
|
+
new Nokogiri body
|
68
|
+
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
def pop_transformation(array)
|
73
|
+
if array.last.respond_to? :call
|
74
|
+
array.pop
|
75
|
+
else
|
76
|
+
Module.new do
|
77
|
+
def self.call(node)
|
78
|
+
node.text.strip if node
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Craft a new object.
|
86
|
+
#
|
87
|
+
# node - A Nokogiri::XML::Node.
|
88
|
+
def initialize(node)
|
89
|
+
@node = node
|
90
|
+
end
|
5
91
|
end
|
data/lib/craft/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.0.1"
|
1
|
+
class Craft
|
2
|
+
VERSION = '0.0.2'
|
3
3
|
end
|
data/spec/craft_spec.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'minitest/autorun'
|
3
|
+
require 'craft'
|
4
|
+
|
5
|
+
describe Craft do
|
6
|
+
let :html do
|
7
|
+
'<html><ul><li>1</li><li>2</li>'
|
8
|
+
end
|
9
|
+
|
10
|
+
let :klass do
|
11
|
+
Class.new Craft
|
12
|
+
end
|
13
|
+
|
14
|
+
let :instance do
|
15
|
+
klass.parse html
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '.many' do
|
19
|
+
it 'extracts nodes' do
|
20
|
+
klass.many 'foo', 'li'
|
21
|
+
instance.foo.must_equal %w(1 2)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'transforms' do
|
25
|
+
klass.many 'foo', 'li', ->(node) { node.text.to_i }
|
26
|
+
instance.foo.must_equal [1, 2]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe '.one' do
|
31
|
+
it 'extracts a node' do
|
32
|
+
klass.one 'foo', 'li'
|
33
|
+
instance.foo.must_equal '1'
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'transforms' do
|
37
|
+
klass.one 'foo', 'li', ->(node) { node.text.to_i }
|
38
|
+
instance.foo.must_equal 1
|
39
|
+
end
|
40
|
+
|
41
|
+
describe 'given no matches' do
|
42
|
+
before do
|
43
|
+
klass.one 'foo', 'foo'
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'returns nil' do
|
47
|
+
instance.foo.must_be_nil
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'nests' do
|
53
|
+
nest = Class.new Craft
|
54
|
+
nest.many 'foo', 'li'
|
55
|
+
klass.one 'foo', 'ul', nest
|
56
|
+
instance.foo.foo.must_equal %w(1 2)
|
57
|
+
end
|
58
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: craft
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,9 +10,25 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2012-10-
|
14
|
-
dependencies:
|
15
|
-
|
13
|
+
date: 2012-10-18 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '1.5'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ~>
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.5'
|
31
|
+
description: Craft XML into objects
|
16
32
|
email:
|
17
33
|
- code@papercavalier.com
|
18
34
|
executables: []
|
@@ -27,7 +43,8 @@ files:
|
|
27
43
|
- craft.gemspec
|
28
44
|
- lib/craft.rb
|
29
45
|
- lib/craft/version.rb
|
30
|
-
|
46
|
+
- spec/craft_spec.rb
|
47
|
+
homepage: http://papercavalier.com/craft/
|
31
48
|
licenses: []
|
32
49
|
post_install_message:
|
33
50
|
rdoc_options: []
|
@@ -39,17 +56,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
39
56
|
- - ! '>='
|
40
57
|
- !ruby/object:Gem::Version
|
41
58
|
version: '0'
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
hash: -3848600058118697198
|
42
62
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
63
|
none: false
|
44
64
|
requirements:
|
45
65
|
- - ! '>='
|
46
66
|
- !ruby/object:Gem::Version
|
47
67
|
version: '0'
|
68
|
+
segments:
|
69
|
+
- 0
|
70
|
+
hash: -3848600058118697198
|
48
71
|
requirements: []
|
49
72
|
rubyforge_project:
|
50
73
|
rubygems_version: 1.8.23
|
51
74
|
signing_key:
|
52
75
|
specification_version: 3
|
53
|
-
summary:
|
54
|
-
test_files:
|
55
|
-
|
76
|
+
summary: Craft is a data extraction tool that crafts objects out of HTML and XML.
|
77
|
+
test_files:
|
78
|
+
- spec/craft_spec.rb
|