samao 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 561ab598e5442161a48b2878887296f8047aa269
4
+ data.tar.gz: 76d3daa2ca3c1407286690404809ee2b58115764
5
+ SHA512:
6
+ metadata.gz: acab6baf781e25d75cb34d7214ac78cdb79936cca735fa554f152342fd285ed562e30c2ac5983a0c21cd6b3154dacd1830871637fd08be89dfbde109569591ca
7
+ data.tar.gz: 5a2f6e9d1a9ee336fe2e4234ec8d6336f00748cb5d39a06932987a557ec342c7f223471700502a64d3b02296920e91f3c2484078b23432b189ec2b09fb529abc
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.3
5
+ - ruby-head
6
+ before_install: gem install bundler -v 1.13.6
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at liulantao@gmail.com. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in samao.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Liu Lantao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,76 @@
1
+ # Samao
2
+
3
+ Samao is a simple, easy-to-use web crawler written in Ruby.
4
+
5
+ [![Build Status](https://travis-ci.org/Lax/Samao.svg?branch=master)](https://travis-ci.org/Lax/Samao)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'samao'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install samao
22
+
23
+ ## Usage
24
+
25
+ #!/usr/bin/env ruby
26
+ require 'samao'
27
+
28
+ # create samao detector
29
+ samao = Samao::Detector.new
30
+
31
+ # set base url and start page
32
+ samao.base_url 'https://github.com'
33
+ samao.from '/Lax?tab=repositories'
34
+ # the following line has the same effect
35
+ #samao.from 'https://github.com/Lax?tab=repositories'
36
+
37
+ # tell samao how to find the next page
38
+ samao.match :next, 'div.pagination a.next_page'
39
+
40
+ # tell samao how to find items.
41
+ # furthermore, set the data from the matched HTML node/element.
42
+ samao.add_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
43
+ item.set_url :url, item.raw(:item)['href']
44
+ item.set :title, item.raw(:item).text.strip
45
+ end
46
+
47
+ # if it needs to open the content page for more information
48
+ # default key is :url
49
+ samao.add_detail :url do |detail|
50
+ #samao.add_detail do |detail|
51
+ detail.match :author, 'h1.public .author a' do |item|
52
+ item.set :author, item.raw(:author).first.text.strip
53
+ end
54
+ end
55
+
56
+ # run the detector
57
+ samao.run
58
+
59
+ # read items
60
+ p samao.items
61
+ ## [{:url=>"https://github.com/Lax/awesome", :title=>"awesome", :author=>"Lax"}, {:url=>"https://github.com/Lax/lax.github.com", :title=>"lax.github.com", :author=>"Lax"}, ..]
62
+
63
+ ## Development
64
+
65
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
66
+
67
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
68
+
69
+ ## Contributing
70
+
71
+ Bug reports and pull requests are welcome on GitHub at https://github.com/Lax/Samao. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
72
+
73
+
74
+ ## License
75
+
76
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "samao"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,57 @@
1
+ module Samao
2
+ class Catcher
3
+ require 'open-uri'
4
+ require 'nokogiri'
5
+
6
+ # accpet url
7
+ # return Catcher self
8
+ def initialize(params)
9
+ @url = params[:url]
10
+ @code = 0
11
+
12
+ @headers = {}
13
+ @headers["Referer"] = params[:base_url] if params[:base_url]
14
+
15
+ self
16
+ end
17
+
18
+ # return Catcher self
19
+ def run
20
+ begin
21
+
22
+ open(@url, @headers) do |f|
23
+ begin
24
+ @doc = Nokogiri::HTML(f)
25
+ @code = 200
26
+ rescue
27
+ @code = 500
28
+ end
29
+ end
30
+ rescue
31
+ @code = 400
32
+ end
33
+
34
+ self
35
+ end
36
+
37
+ # catcher task is success or not
38
+ def success?
39
+ @code == 200
40
+ end
41
+
42
+ # catcher task result with code and doc
43
+ def result
44
+ code:@code, doc:@doc
45
+ end
46
+
47
+ # catcher task result doc
48
+ def doc
49
+ @doc
50
+ end
51
+
52
+ # catcher task result code
53
+ def code
54
+ @code
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,30 @@
1
+ module Samao
2
+ class Detail
3
+ include Matchable
4
+
5
+ def initialize(params={})
6
+ matchable
7
+
8
+ @item = params[:item]
9
+ @url = params[:url]
10
+ @base_url = params[:base_url]
11
+ @catcher = Catcher.new(url:@url, base_url: @base_url)
12
+
13
+ yield self if block_given?
14
+
15
+ self
16
+ end
17
+
18
+ def run
19
+ if @catcher and @catcher.run.success? and doc = @catcher.doc
20
+ @selector.each do |name, sel|
21
+ @item.set_raw name, doc.css(sel)
22
+ @on[name].call @item if @on[name]
23
+ end
24
+ end
25
+
26
+ self
27
+ end
28
+
29
+ end
30
+ end
@@ -0,0 +1,96 @@
1
+ module Samao
2
+ class Detector
3
+ include Matchable
4
+
5
+ def initialize(params={})
6
+ matchable
7
+
8
+ @current_url = @base_url = @from = nil
9
+ @pages = []
10
+ @items = []
11
+
12
+ yield self if block_given?
13
+
14
+ self
15
+ end
16
+
17
+ # return Detector self
18
+ def run
19
+ while @from and @from.run.success? and @current_doc = @from.doc
20
+ # find items in current_page
21
+ if found = @current_doc.css(@selector[:item]) and found.size >= 1
22
+ @items += found.map do |raw_item|
23
+ item = Item.new(base_url: @current_url, raw_item:raw_item) do |item|
24
+ @on[:item].call(item) if @on[:item]
25
+ end.run
26
+
27
+ if @detail_key
28
+ detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
29
+ @on[:detail].call(detail) if @on[:detail]
30
+ end.run
31
+ end
32
+
33
+ item.prop
34
+ end
35
+ end
36
+
37
+ # find next page[s] in current page
38
+ if @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
39
+ @on[:next].call(next_url) if @on[:next]
40
+ from next_url
41
+ else
42
+ stop
43
+ end
44
+ end
45
+
46
+ self
47
+ end
48
+
49
+ def add_detail(detail_key=:url, &block)
50
+ @detail_key = detail_key
51
+ @on[:detail] = block if block
52
+ end
53
+
54
+ def add_item(selector, &block)
55
+ match(:item, selector, &block)
56
+ end
57
+
58
+ # set front page
59
+ def from(url)
60
+ if prev_url = @current_url || @base_url
61
+ url = URI.join(prev_url, url)
62
+ end
63
+ url = URI(url) if ! url.is_a? URI
64
+
65
+ @from = Catcher.new(url:url, base_url:@current_url)
66
+ @pages << url
67
+ @current_url = url
68
+
69
+ self
70
+ end
71
+
72
+ # set base url
73
+ def base_url(url)
74
+ @base_url = url
75
+
76
+ self
77
+ end
78
+
79
+ # get pages
80
+ def pages
81
+ @pages
82
+ end
83
+
84
+ # get items
85
+ def items
86
+ @items
87
+ end
88
+
89
+ private
90
+ def stop
91
+ @from = nil
92
+
93
+ self
94
+ end
95
+ end
96
+ end
data/lib/samao/item.rb ADDED
@@ -0,0 +1,58 @@
1
+ module Samao
2
+ class Item
3
+ include Matchable
4
+
5
+ def initialize(params={})
6
+ matchable
7
+
8
+ @prop = {} # useful properties, filled by #set / #set_url and read by #prop
9
+ @raw = {} # matched nodes go here, keyed by name (see #set_raw / #raw)
10
+
11
+ @base_url = params[:base_url]
12
+ set_raw :item, params[:raw_item] if params[:raw_item]
13
+
14
+ yield self if block_given?
15
+
16
+ self
17
+ end
18
+
19
+ def extract
20
+ @selector.each do |name, sel|
21
+ set_raw name, @raw[:item].css(sel)
22
+ @on[name].call self if @on[name]
23
+ end
24
+
25
+ self
26
+ end
27
+ alias :run :extract
28
+
29
+ def set(name, value)
30
+ @prop[name] = value
31
+ end
32
+
33
+ def set_url(name, value)
34
+ value = URI.join @base_url, value if @base_url
35
+ set(name, value.to_s)
36
+ end
37
+
38
+ def prop(name=nil)
39
+ if name
40
+ return @prop[name]
41
+ else
42
+ return @prop
43
+ end
44
+ end
45
+
46
+ def set_raw(name, value)
47
+ @raw[name] = value
48
+ end
49
+
50
+ def raw(name=nil)
51
+ if name
52
+ return @raw[name]
53
+ else
54
+ return @raw
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,21 @@
1
+ module Samao
2
+ module Matchable
3
+ def matchable?
4
+ true
5
+ end
6
+
7
+ def matchable
8
+ @selector = {}
9
+ @on = {}
10
+ end
11
+
12
+ def match(name, selector, &block)
13
+ @selector[name] = selector
14
+
15
+ @on[name] = block if block
16
+
17
+ self
18
+ end
19
+
20
+ end
21
+ end
@@ -0,0 +1,3 @@
1
+ module Samao
2
+ VERSION = "0.1.0"
3
+ end
data/lib/samao.rb ADDED
@@ -0,0 +1,12 @@
1
+ require "samao/version"
2
+
3
+ require "samao/matchable"
4
+
5
+ require "samao/catcher"
6
+ require "samao/detector"
7
+ require "samao/item"
8
+ require "samao/detail"
9
+
10
+ module Samao
11
+ # Your code goes here...
12
+ end
data/samao.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'samao/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "samao"
8
+ spec.version = Samao::VERSION
9
+ spec.authors = ["Liu Lantao"]
10
+ spec.email = ["liulantao@gmail.com"]
11
+
12
+ spec.summary = %q{Samao - Scalable web spider.}
13
+ spec.description = %q{Samao is a web crawler written in ruby.}
14
+ spec.homepage = "https://github.com/Lax/Samao"
15
+ spec.license = "MIT"
16
+
17
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
19
+ #if spec.respond_to?(:metadata)
20
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
21
+ #else
22
+ # raise "RubyGems 2.0 or newer is required to protect against " \
23
+ # "public gem pushes."
24
+ #end
25
+
26
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
27
+ f.match(%r{^(test|spec|features)/})
28
+ end
29
+ spec.bindir = "exe"
30
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
31
+ spec.require_paths = ["lib"]
32
+
33
+ spec.add_dependency "nokogiri"
34
+ spec.add_development_dependency "bundler", "~> 1.13"
35
+ spec.add_development_dependency "rake", "~> 10.0"
36
+ spec.add_development_dependency "rspec", "~> 3.0"
37
+ end
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: samao
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Liu Lantao
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-12-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.13'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.13'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ description: Samao is a web crawler written in ruby.
70
+ email:
71
+ - liulantao@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - ".rspec"
78
+ - ".travis.yml"
79
+ - CODE_OF_CONDUCT.md
80
+ - Gemfile
81
+ - LICENSE.txt
82
+ - README.md
83
+ - Rakefile
84
+ - bin/console
85
+ - bin/setup
86
+ - lib/samao.rb
87
+ - lib/samao/catcher.rb
88
+ - lib/samao/detail.rb
89
+ - lib/samao/detector.rb
90
+ - lib/samao/item.rb
91
+ - lib/samao/matchable.rb
92
+ - lib/samao/version.rb
93
+ - samao.gemspec
94
+ homepage: https://github.com/Lax/Samao
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project:
114
+ rubygems_version: 2.5.2
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Samao - Scalable web spider.
118
+ test_files: []