craft 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +26 -18
- data/Rakefile +8 -1
- data/craft.gemspec +10 -7
- data/lib/craft.rb +89 -3
- data/lib/craft/version.rb +2 -2
- data/spec/craft_spec.rb +58 -0
- metadata +31 -8
data/README.md
CHANGED
@@ -1,29 +1,37 @@
|
|
1
1
|
# Craft
|
2
2
|
|
3
|
-
|
3
|
+
Craft XML and HTML into objects.
|
4
4
|
|
5
|
-
##
|
5
|
+
## Examples
|
6
|
+
```ruby
|
7
|
+
require 'craft'
|
8
|
+
require 'open-uri'
|
6
9
|
|
7
|
-
|
10
|
+
class Page < Craft
|
11
|
+
# Use CSS selectors
|
12
|
+
one :title, 'title'
|
8
13
|
|
9
|
-
|
14
|
+
# Use XPath
|
15
|
+
many :links, 'a/@href'
|
10
16
|
|
11
|
-
|
17
|
+
# Perform transforms on returned nodes
|
18
|
+
many :images, 'img', lambda { |img| img.attr('src').upcase }
|
19
|
+
end
|
12
20
|
|
13
|
-
|
21
|
+
page = Page.parse open('http://www.google.com')
|
14
22
|
|
15
|
-
|
23
|
+
page.title #=> 'Google'
|
24
|
+
page.links #=> ['http://www.google.com/imghp?hl=en&tab=wi', ...]
|
25
|
+
page.images #=> ['/LOGOS/2012/MOBY_DICK12-HP.JPG']
|
16
26
|
|
17
|
-
|
27
|
+
class Script < Craft
|
28
|
+
one :body, 'text()'
|
29
|
+
end
|
18
30
|
|
19
|
-
|
31
|
+
class Page < Craft
|
32
|
+
many :scripts, 'script', Script
|
33
|
+
end
|
20
34
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
1. Fork it
|
26
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
-
5. Create new Pull Request
|
35
|
+
page = Page.parse open('http://www.google.com')
|
36
|
+
page.scripts[0].body #=> 'window.google=...'
|
37
|
+
```
|
data/Rakefile
CHANGED
data/craft.gemspec
CHANGED
@@ -4,16 +4,19 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'craft/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
|
-
gem.name =
|
7
|
+
gem.name = 'craft'
|
8
8
|
gem.version = Craft::VERSION
|
9
|
-
gem.authors = [
|
10
|
-
gem.email = [
|
11
|
-
gem.description = %q{
|
12
|
-
gem.summary = %q{
|
13
|
-
|
9
|
+
gem.authors = ['Ezekiel Templin', 'Hakan Ensari']
|
10
|
+
gem.email = ['code@papercavalier.com']
|
11
|
+
gem.description = %q{Craft XML into objects}
|
12
|
+
gem.summary = %q{Craft is a data extraction tool that crafts objects
|
13
|
+
out of HTML and XML.}
|
14
|
+
gem.homepage = 'http://papercavalier.com/craft/'
|
14
15
|
|
15
16
|
gem.files = `git ls-files`.split($/)
|
16
17
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
18
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
-
gem.require_paths = [
|
19
|
+
gem.require_paths = ['lib']
|
20
|
+
|
21
|
+
gem.add_dependency 'nokogiri', '~> 1.5'
|
19
22
|
end
|
data/lib/craft.rb
CHANGED
@@ -1,5 +1,91 @@
|
|
1
|
-
require
|
1
|
+
require 'craft/version'
|
2
|
+
require 'nokogiri'
|
2
3
|
|
3
|
-
|
4
|
-
|
4
|
+
# Craft objects out of HTML and XML.
|
5
|
+
#
|
6
|
+
# Examples
|
7
|
+
#
|
8
|
+
# module Transformations
|
9
|
+
# IntegerTransform = lambda { |n| Integer n.text }
|
10
|
+
# end
|
11
|
+
#
|
12
|
+
# class Person < Craft
|
13
|
+
# include Transformations
|
14
|
+
#
|
15
|
+
# one :name, 'div.name'
|
16
|
+
# one :age, 'div.age', IntegerTransform
|
17
|
+
# many :friends, 'li.friend', Person
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
class Craft
|
21
|
+
class << self
|
22
|
+
# We alias call to new so that crafted objects may nest themselves or other
|
23
|
+
# crafted objects as transformations.
|
24
|
+
alias call new
|
25
|
+
|
26
|
+
# Define a method that extracts a collection of values from a parsed
|
27
|
+
# document.
|
28
|
+
#
|
29
|
+
# name - The Symbol name of the method.
|
30
|
+
# paths - One or more String XPath or CSS queries. An optional Proc
|
31
|
+
# transformation on the extracted value may be appended. If none is
|
32
|
+
# appended, the default transformation returns the stripped String
|
33
|
+
# value of the node.
|
34
|
+
#
|
35
|
+
# Returns an Array.
|
36
|
+
def many(name, *paths)
|
37
|
+
transform = pop_transformation paths
|
38
|
+
|
39
|
+
define_method name do
|
40
|
+
@node.search(*paths).map { |node| transform.call node }
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Define a method that extracts a single value from a parsed document.
|
45
|
+
#
|
46
|
+
# name - The Symbol name of the method.
|
47
|
+
# paths - One or more String XPath or CSS queries. An optional Proc
|
48
|
+
# transformation on the extracted value may be appended. If none is
|
49
|
+
# appended, the default transformation returns the stripped String
|
50
|
+
# value of the node.
|
51
|
+
#
|
52
|
+
# Returns an Object.
|
53
|
+
def one(name, *paths)
|
54
|
+
transform = pop_transformation paths
|
55
|
+
|
56
|
+
define_method name do
|
57
|
+
transform.call @node.at(*paths)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
# Parse a document.
|
62
|
+
#
|
63
|
+
# body - A String HTML or XML document.
|
64
|
+
#
|
65
|
+
# Returns a new instance of the class it is called on.
|
66
|
+
def parse(body)
|
67
|
+
new Nokogiri body
|
68
|
+
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
def pop_transformation(array)
|
73
|
+
if array.last.respond_to? :call
|
74
|
+
array.pop
|
75
|
+
else
|
76
|
+
Module.new do
|
77
|
+
def self.call(node)
|
78
|
+
node.text.strip if node
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Craft a new object.
|
86
|
+
#
|
87
|
+
# node - A Nokogiri::XML::Node.
|
88
|
+
def initialize(node)
|
89
|
+
@node = node
|
90
|
+
end
|
5
91
|
end
|
data/lib/craft/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION =
|
1
|
+
class Craft
|
2
|
+
VERSION = '0.0.2'
|
3
3
|
end
|
data/spec/craft_spec.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'minitest/autorun'
|
3
|
+
require 'craft'
|
4
|
+
|
5
|
+
describe Craft do
|
6
|
+
let :html do
|
7
|
+
'<html><ul><li>1</li><li>2</li>'
|
8
|
+
end
|
9
|
+
|
10
|
+
let :klass do
|
11
|
+
Class.new Craft
|
12
|
+
end
|
13
|
+
|
14
|
+
let :instance do
|
15
|
+
klass.parse html
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '.many' do
|
19
|
+
it 'extracts nodes' do
|
20
|
+
klass.many 'foo', 'li'
|
21
|
+
instance.foo.must_equal %w(1 2)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'transforms' do
|
25
|
+
klass.many 'foo', 'li', ->(node) { node.text.to_i }
|
26
|
+
instance.foo.must_equal [1, 2]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe '.one' do
|
31
|
+
it 'extracts a node' do
|
32
|
+
klass.one 'foo', 'li'
|
33
|
+
instance.foo.must_equal '1'
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'transforms' do
|
37
|
+
klass.one 'foo', 'li', ->(node) { node.text.to_i }
|
38
|
+
instance.foo.must_equal 1
|
39
|
+
end
|
40
|
+
|
41
|
+
describe 'given no matches' do
|
42
|
+
before do
|
43
|
+
klass.one 'foo', 'foo'
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'returns nil' do
|
47
|
+
instance.foo.must_be_nil
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'nests' do
|
53
|
+
nest = Class.new Craft
|
54
|
+
nest.many 'foo', 'li'
|
55
|
+
klass.one 'foo', 'ul', nest
|
56
|
+
instance.foo.foo.must_equal %w(1 2)
|
57
|
+
end
|
58
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: craft
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,9 +10,25 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2012-10-
|
14
|
-
dependencies:
|
15
|
-
|
13
|
+
date: 2012-10-18 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '1.5'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ~>
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.5'
|
31
|
+
description: Craft XML into objects
|
16
32
|
email:
|
17
33
|
- code@papercavalier.com
|
18
34
|
executables: []
|
@@ -27,7 +43,8 @@ files:
|
|
27
43
|
- craft.gemspec
|
28
44
|
- lib/craft.rb
|
29
45
|
- lib/craft/version.rb
|
30
|
-
|
46
|
+
- spec/craft_spec.rb
|
47
|
+
homepage: http://papercavalier.com/craft/
|
31
48
|
licenses: []
|
32
49
|
post_install_message:
|
33
50
|
rdoc_options: []
|
@@ -39,17 +56,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
39
56
|
- - ! '>='
|
40
57
|
- !ruby/object:Gem::Version
|
41
58
|
version: '0'
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
hash: -3848600058118697198
|
42
62
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
63
|
none: false
|
44
64
|
requirements:
|
45
65
|
- - ! '>='
|
46
66
|
- !ruby/object:Gem::Version
|
47
67
|
version: '0'
|
68
|
+
segments:
|
69
|
+
- 0
|
70
|
+
hash: -3848600058118697198
|
48
71
|
requirements: []
|
49
72
|
rubyforge_project:
|
50
73
|
rubygems_version: 1.8.23
|
51
74
|
signing_key:
|
52
75
|
specification_version: 3
|
53
|
-
summary:
|
54
|
-
test_files:
|
55
|
-
|
76
|
+
summary: Craft is a data extraction tool that crafts objects out of HTML and XML.
|
77
|
+
test_files:
|
78
|
+
- spec/craft_spec.rb
|