extractify 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in extractify.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Brian John
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,105 @@
1
+ # Extractify
2
+
3
+ A trivial library to extract groups of data from XML or HTML.
4
+
5
+ Do you have a crusty XML API or an HTML page that you want to do some quick-and-dirty scraping from? Extractify might just be able to help you.
6
+
7
+ Extractify uses the power of the mighty [Nokogiri](https://github.com/sparklemotion/nokogiri).
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'extractify'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install extractify
22
+
23
+ ## Usage
24
+
25
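+ Say you have some XML that you want to pull some data out of. Pass `Extractify.extract` the document and a hash that maps a container selector to the labelled selectors to look up inside each matching container; you get back one hash per container.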
26
+
27
+ ```Ruby
28
+ xml = '<?xml version="1.0"?>
29
+ <movies>
30
+ <movie>
31
+ <title>Very Scary Movie</title>
32
+ <released>1999</released>
33
+ <length>117</length>
34
+ <director>Jane Doe</director>
35
+ <genre>Horror</genre>
36
+ </movie>
37
+ <movie>
38
+ <title>Funny Movie</title>
39
+ <released>2005</released>
40
+ <length>120</length>
41
+ <director>John Doe</director>
42
+ <genre>Comedy</genre>
43
+ </movie>
44
+ <movie>
45
+ <title>SciFi Movie</title>
46
+ <released>2009</released>
47
+ <length>105</length>
48
+ <director>Jim Doe</director>
49
+ <genre>Science Fiction</genre>
50
+ </movie>
51
+ </movies>'
52
+
53
+ Extractify.extract xml, '//movie' => { :name => '//title/text()', :runtime => '//length/text()' }
54
+ => [{:name=>"Very Scary Movie", :runtime=>"117"},
55
+ {:name=>"Funny Movie", :runtime=>"120"},
56
+ {:name=>"SciFi Movie", :runtime=>"105"}]
57
+ ```
58
+
59
+ This also works with CSS selectors for HTML.
60
+ ```Ruby
61
+ html = '
62
+ <html>
63
+ <div class=movies>
64
+ <div class=movie>
65
+ <span class=title>Very Scary Movie</span>
66
+ <span class=released>1999</span>
67
+ <span class=length>117</span>
68
+ <span class=director>Jane Doe</span>
69
+ <span class=genre>Horror</span>
70
+ </div>
71
+ <div class=movie>
72
+ <span class=title>Funny Movie</span>
73
+ <span class=released>2005</span>
74
+ <span class=length>120</span>
75
+ <span class=director>John Doe</span>
76
+ <span class=genre>Comedy</span>
77
+ </div>
78
+ <div class=movie>
79
+ <span class=title>SciFi Movie</span>
80
+ <span class=released>2009</span>
81
+ <span class=length>105</span>
82
+ <span class=director>Jim Doe</span>
83
+ <span class=genre>Science Fiction</span>
84
+ </div>
85
+ </div>
86
+ </html>'
87
+
88
+ Extractify.extract html, '.movie' => { :name => '.title/text()', :runtime => '.length/text()' }
89
+ => [{:name=>"Very Scary Movie", :runtime=>"117"},
90
+ {:name=>"Funny Movie", :runtime=>"120"},
91
+ {:name=>"SciFi Movie", :runtime=>"105"}]
92
+ ```
93
+
94
+
95
+
96
+
97
+
98
+
99
+ ## Contributing
100
+
101
+ 1. Fork it
102
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
103
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
104
+ 4. Push to the branch (`git push origin my-new-feature`)
105
+ 5. Create new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/extractify/version', __FILE__)
3
+
4
# Gem specification for extractify: package metadata, the file manifest
# (taken from git) and the runtime/development dependencies.
Gem::Specification.new do |gem|
  gem.authors       = ["Brian John"]
  gem.email         = ["brian@brianjohn.com"]
  gem.description   = %q{extract grouped data from XML or HTML}
  gem.summary       = %q{extract grouped data from XML or HTML}
  gem.homepage      = "https://github.com/f1sherman/extractify"
  # The project ships an MIT LICENSE file; declare it so the built gem's
  # metadata does not end up with an empty license list.
  gem.license       = "MIT"

  # `git ls-files` prints one path per line. Split on newlines explicitly
  # rather than on `$\`: that variable is nil by default, which makes
  # String#split break on any whitespace and mangles paths containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "extractify"
  gem.require_paths = ["lib"]
  gem.version       = Extractify::VERSION

  gem.add_dependency 'nokogiri'

  gem.add_development_dependency 'rspec'
end
@@ -0,0 +1,37 @@
1
+ require 'extractify/version'
2
+ require 'nokogiri'
3
+
4
# Extractify pulls groups of labelled values out of an XML or HTML document
# using Nokogiri selectors (XPath or CSS).
module Extractify
  # Extracts one hash of labelled values for every container node in +doc+.
  #
  # doc          - Markup to search. Anything that is not already a
  #                Nokogiri::XML::Document is handed to Nokogiri.parse.
  # instructions - Hash of container selector => { label => element selector }.
  #                Element selectors are looked up inside each container.
  #
  # Returns an Array of Hashes (label => first matching node converted with
  # to_s), one per container that produced at least one value, in the order
  # the containers were found. Labels whose selector matches nothing are
  # omitted from that container's hash.
  def self.extract(doc, instructions = {})
    doc = Nokogiri.parse(doc) unless doc.is_a?(Nokogiri::XML::Document)

    extract_nodes(doc, instructions)
  end

  # Walks every container matched by each container selector and collects the
  # first match of each element selector within it.
  #
  # doc          - Object responding to #search (a parsed document).
  # instructions - Same shape as for .extract.
  #
  # Returns an Array of Hashes; containers that yield no values are skipped.
  def self.extract_nodes(doc, instructions)
    extracted_nodes = []

    instructions.each_pair do |container_selector, element_selectors|
      doc.search(container_selector).each do |container|
        container_nodes = {}

        element_selectors.each_pair do |label, selector|
          # A selector starting with "/" is an absolute XPath and would search
          # the whole document; prefixing "." makes it relative to the container.
          # start_with? is used instead of comparing selector[0] to '/', because
          # String#[] with an Integer returns a character code on Ruby 1.8.
          selector = ".#{selector}" if selector.to_s.start_with?('/')

          node = container.search(selector).first
          container_nodes[label] = node.to_s if node
        end

        extracted_nodes << container_nodes unless container_nodes.empty?
      end
    end

    extracted_nodes
  end
  # A bare `private` has no effect on methods defined with `def self.`, so the
  # helper is hidden explicitly.
  private_class_method :extract_nodes
end
@@ -0,0 +1,3 @@
1
# Namespace of the extractify gem; this file contributes only the version constant.
+ module Extractify
2
# Release version string of the gem (assigned to gem.version in extractify.gemspec).
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
+
3
# Examples for Extractify.extract: CSS and XPath element selectors, several
# labels per container, scoping of lookups to each container, omission of
# labels that match nothing, and pre-parsed Nokogiri documents as input.
describe Extractify do
  describe :extract do
    it 'should extract css selectors' do
      Extractify.extract('<html><div class=bif>baz</div></html>', 'html' => { :bif => '.bif/text()' }).should == [{ :bif => 'baz' }]
    end

    it 'should extract xpath selectors' do
      Extractify.extract('<foo><bar>baz</bar></foo>', 'foo' => { :bar => '//bar/text()' }).should == [{ :bar => 'baz' }]
    end

    it 'should handle multiple instructions' do
      Extractify.extract('<html><div class=bif>baz</div><div class=baz>foo</div></html>', 'html' => { :bif => '.bif/text()', :baz => '.baz/text()' }).should == [{ :bif => 'baz', :baz => 'foo' }]
    end

    # The container selector matches nothing, so the elements that do exist
    # in the document must not be reported.
    it 'should not find results outside of the container' do
      Extractify.extract('<html><div class=bif>baz</div><div class=baz>foo</div></html>', 'doesnotexist' => { :bif => '.bif/text()', :baz => '.baz/text()' }).should == []
    end

    # '//child' is written as an absolute XPath; each container must still
    # yield only its own child.
    it 'should search only within the container' do
      doc = '<parents>
        <parent>
          <child>child1</child>
        </parent>
        <parent>
          <child>child2</child>
        </parent>
      </parents>'
      Extractify.extract(doc, 'parent' => { :child => '//child/text()' }).should == [{:child => 'child1'}, {:child => 'child2'}]
    end

    # The second movie has no <length> and the third has no <title>; the
    # missing labels are left out of the corresponding hashes.
    it 'should group results in same order as instructions' do
      doc = '<?xml version="1.0"?>
      <movies>
        <movie>
          <title>Very Scary Movie</title>
          <released>1999</released>
          <length>117</length>
          <director>Jane Doe</director>
          <genre>Horror</genre>
        </movie>
        <movie>
          <title>Funny Movie</title>
          <released>2005</released>
          <director>John Doe</director>
          <genre>Comedy</genre>
        </movie>
        <movie>
          <released>2009</released>
          <length>105</length>
          <director>Jim Doe</director>
          <genre>Science Fiction</genre>
        </movie>
      </movies>'

      Extractify.extract(doc, '//movie' => { :name => '//title/text()', :runtime => '//length/text()' }).should == [
        {:name => 'Very Scary Movie', :runtime => '117'},
        {:name => 'Funny Movie'},
        {:runtime => '105'},
      ]
    end

    it 'should allow Nokogiri documents to be passed in' do
      Extractify.extract(Nokogiri.parse('<html><div class=bif>baz</div></html>'), 'html' => { :bif => '.bif/text()' }).should == [{ :bif => 'baz' }]
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'rspec'
2
+ require 'extractify'
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extractify
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brian John
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: extract grouped data from XML or HTML
47
+ email:
48
+ - brian@brianjohn.com
49
+ executables: []
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - .gitignore
54
+ - Gemfile
55
+ - LICENSE
56
+ - README.md
57
+ - Rakefile
58
+ - extractify.gemspec
59
+ - lib/extractify.rb
60
+ - lib/extractify/version.rb
61
+ - spec/extractify_spec.rb
62
+ - spec/spec_helper.rb
63
+ homepage: https://github.com/f1sherman/extractify
64
+ licenses: []
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ! '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 1.8.23
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: extract grouped data from XML or HTML
87
+ test_files:
88
+ - spec/extractify_spec.rb
89
+ - spec/spec_helper.rb