tjcrawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d862225dcccd8804520d59ed284c9e536eaed0c8
4
+ data.tar.gz: 5ce47b642a5bae952139f8ad53adb00d4f697e49
5
+ SHA512:
6
+ metadata.gz: b376fdb73d63fe698ddd1dbb787ab3e5973a58fdc1e5c4514090c0785e3cf594dbbb3398d8b26dd555f255e899a70b1c3406aa741fe1af7273dae1913f8265e3
7
+ data.tar.gz: da0abd28dfb2245fd5b9a54d3e3094d4db1c8add13385eed00b13f15580d877a0ff22fd41e49c04e6c8361dead88f23edfd44d3c60e1592cad9bfb63f549c7ff
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.5
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tjcrawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Jian Weihang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Tjcrawler
2
+
3
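+ Tjcrawler is a simple multi-threaded web crawling tool: it fetches pages, stores them in a database through ActiveRecord, and lets you parse the stored pages with your own block.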
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'tjcrawler'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install tjcrawler
20
+
21
+ ## Usage
22
+
23
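+ Create the `pages` table with `examples/schema.rb`, then see `examples/dsl.rb` (crawler) and `examples/dsl_parser.rb` (parser) for runnable usage examples.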
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/tjcrawler/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
data/examples/dsl.rb ADDED
@@ -0,0 +1,7 @@
1
require 'tjcrawler'
require 'pg'

# Example: run the crawler through the block-based DSL.
# Connects to the PostgreSQL database "fc2" and starts five crawler threads
# that collect the links matched by the CSS selector below.
Tjcrawler.start_crawler do |crawler|
  crawler.threads = 5
  crawler.css_selector = '.pager_box > a'
  crawler.db_setting = { adapter: 'postgresql', database: 'fc2' }
end
@@ -0,0 +1,9 @@
1
require 'tjcrawler'
require 'pg'

# Example: run the parser through the block-based DSL.
# The lambda receives each stored page as a Nokogiri document; a truthy return
# value marks the page as parsed.
Tjcrawler.start_parser do |parser|
  parser.proc = lambda do |_doc|
    true
  end
  parser.threads = 3
  parser.db_setting = { adapter: 'postgresql', database: 'fc2' }
end
data/examples/fc2.rb ADDED
@@ -0,0 +1,15 @@
1
require 'tjcrawler'
require 'pg'

# Example: drive the crawler classes directly instead of going through the DSL.
db_config = { adapter: 'postgresql', database: 'fc2' }
ActiveRecord::Base.establish_connection(db_config)

# Seed the page queue with the first URL to visit.
Tjcrawler::Page.enqueue 'http://video.fc2.com/list_scont/'

# Start five crawler workers and wait on them.
workers = Array.new(5) do
  Thread.new { Tjcrawler::Crawler.new('.pager_box > a').start }
end
workers.each(&:join)
@@ -0,0 +1,13 @@
1
require 'tjcrawler'
require 'pg'

# Example: run a single parser directly, without the DSL.
ActiveRecord::Base.establish_connection(adapter: 'postgresql', database: 'fc2')

# The block is handed each stored page as a Nokogiri document; returning true
# marks the page as successfully parsed.
Tjcrawler::Parser.new { |_doc| true }.start
@@ -0,0 +1,15 @@
1
require 'active_record'
require 'pg'

# Example: create the `pages` table that Tjcrawler::Page reads and writes:
# url (uniquely indexed), content, and the crawl/parse timestamps.
ActiveRecord::Base.establish_connection(adapter: 'postgresql', database: 'fc2')

ActiveRecord::Migration.create_table(:pages, force: true) do |table|
  table.string   :url
  table.text     :content
  table.datetime :crawled_at
  table.datetime :parsed_at
  table.index    :url, unique: true
end
data/lib/tjcrawler.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'tjcrawler/version'
2
+ require 'tjcrawler/crawler'
3
+ require 'tjcrawler/parser'
4
+ require 'tjcrawler/config'
5
+
6
module Tjcrawler
  module_function

  # Configures the database connection and runs crawler workers.
  #
  # Yields a fresh Config to the given block, connects ActiveRecord with
  # +config.db_setting+, then starts +config.threads+ threads, each running a
  # Crawler built from +config.css_selector+. Blocks until every thread ends.
  #
  # @yieldparam config [Tjcrawler::Config] settings object to fill in
  # @return [Array<Thread>] the joined worker threads
  # @raise [ArgumentError] if no block is given
  def start_crawler
    raise ArgumentError, 'Block required' unless block_given?

    config = Config.new
    yield config
    set_db(config)
    run_workers(config.threads) { Tjcrawler::Crawler.new(config.css_selector).start }
  end

  # Configures the database connection and runs parser workers.
  #
  # Works like #start_crawler, but each thread runs a Parser whose block is
  # +config.proc+.
  #
  # @yieldparam config [Tjcrawler::Config] settings object to fill in
  # @return [Array<Thread>] the joined worker threads
  # @raise [ArgumentError] if no block is given
  def start_parser
    raise ArgumentError, 'Block required' unless block_given?

    config = Config.new
    yield config
    set_db(config)
    run_workers(config.threads) { Tjcrawler::Parser.new(&config.proc).start }
  end

  # Opens the ActiveRecord connection described by +config.db_setting+.
  #
  # @param config [Tjcrawler::Config]
  def set_db(config)
    ActiveRecord::Base.establish_connection config.db_setting
  end

  # Runs +job+ in +count+ threads and waits for all of them.
  #
  # @param count [Integer] number of threads to start
  # @return [Array<Thread>] the joined threads
  def run_workers(count, &job)
    Array.new(count) { Thread.new(&job) }.each(&:join)
  end
  # Internal helper; keep it off the module's public surface.
  private_class_method :run_workers
end
@@ -0,0 +1,8 @@
1
module Tjcrawler
  # Mutable settings object that Tjcrawler.start_crawler and
  # Tjcrawler.start_parser yield to the caller's block.
  class Config
    # Connection settings passed to ActiveRecord::Base.establish_connection.
    attr_accessor :db_setting
    # CSS selector handed to each Crawler.
    attr_accessor :css_selector
    # Number of worker threads to start (5 unless overridden).
    attr_accessor :threads
    # Callable that becomes the block of each Parser.
    attr_accessor :proc

    def initialize
      self.threads = 5
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'tjcrawler/crawler/result'
4
+ require 'tjcrawler/page'
5
+
6
module Tjcrawler
  # Fetches pages, stores their content and queues the links found on them.
  class Crawler
    # @param css_selector_for_link_tags [String] CSS selector matching the
    #   link elements whose +href+ should be collected.
    def initialize(css_selector_for_link_tags)
      @css = css_selector_for_link_tags
    end

    # Downloads +url+ and extracts the links matched by the CSS selector.
    #
    # @param url [String] address of the page to fetch
    # @return [Result] normalized URL, raw content and absolute link URLs
    def crawl(url)
      uri = URI(url).tap(&:normalize!)
      # OpenURI::OpenRead#read closes the underlying IO for us and does not
      # depend on open-uri's Kernel#open patch (gone since Ruby 3.0).
      content = uri.read
      doc = Nokogiri::HTML(content)
      links = doc.css(@css).map { |link| absolute_link(uri, link[:href]) }.compact
      Result.new url: uri.to_s, content: content, links: links
    end

    # Endless worker loop: waits for a page from Page.dequeue, crawls it,
    # saves the content and enqueues every discovered link.
    def start
      loop do
        sleep 1 until (page = Page.dequeue)
        print '.'
        result = crawl page.url
        page.touch(:crawled_at) if page.update(content: result.content)
        result.links.each { |url| Page.enqueue url }
      end
    end

    private

    # Resolves +href+ against +base+.
    #
    # @return [String, nil] the absolute URL, or nil when the element has no
    #   href or the href is not a valid URI (so one bad link cannot abort the
    #   whole crawl).
    def absolute_link(base, href)
      return if href.nil?

      base.merge(href).to_s
    rescue URI::Error
      nil
    end
  end
end
@@ -0,0 +1,12 @@
1
module Tjcrawler
  class Crawler
    # Plain value holder for the outcome of crawling one page.
    class Result
      attr_accessor :url, :content, :links

      # Copies every non-nil keyword argument into the instance variable of
      # the same name (for example +url: 'x'+ sets +@url+).
      def initialize(**params)
        present = params.reject { |_name, value| value.nil? }
        present.each_pair do |name, value|
          instance_variable_set("@#{name}", value)
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'active_record'
2
+ require 'active_support/core_ext/numeric/time'
3
+ require 'active_support/core_ext/date/calculations'
4
+
5
module Tjcrawler
  # ActiveRecord model for a URL that is queued, crawled and later parsed.
  class Page < ActiveRecord::Base
    # Serializes find-or-create calls made by threads of this process.
    # A constant replaces the former class variable, which would be shared
    # (and overwritable) across the whole inheritance tree.
    ENQUEUE_LOCK = Mutex.new

    class << self
      # Ensures a Page row exists for +url+.
      #
      # @param url [String]
      # @return [Page] the existing or newly created record
      def enqueue(url)
        ENQUEUE_LOCK.synchronize { Page.find_or_create_by url: url }
      rescue ActiveRecord::RecordNotUnique
        # The mutex only covers this process; if another process inserted the
        # same URL first, the unique index rejects our insert, so return the
        # row that won the race.
        Page.find_by url: url
      end

      # Picks the next page to crawl: one that was never crawled or was last
      # crawled more than a day ago, never-crawled pages first, then oldest.
      #
      # @return [Page, nil] nil when nothing is due
      def dequeue
        Page.where('crawled_at IS NULL OR crawled_at < ?', 1.day.ago)
            # Arel.sql marks this constant fragment as trusted raw SQL; newer
            # ActiveRecord versions reject unmarked raw SQL in #order.
            .order(Arel.sql('crawled_at IS NOT NULL, crawled_at'))
            .first
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'nokogiri'
2
+ require 'tjcrawler/page'
3
+
4
module Tjcrawler
  # Runs a user-supplied block over the content of crawled pages.
  class Parser
    # The block receives a Nokogiri document and returns true/false for
    # successful/failed parsing.
    #
    # @raise [ArgumentError] if no block is given
    def initialize(&block)
      # The original used `yield` here, which cannot report a missing block.
      raise ArgumentError, 'Block required' unless block_given?

      @strategy = block
    end

    # Parses +content+ as HTML and hands the document to the strategy block.
    #
    # @param content [String] raw HTML
    # @return [Object] whatever the strategy block returned
    def parse(content)
      doc = Nokogiri::HTML(content)
      ret = @strategy.call(doc)
      print '.' # progress indicator, one dot per parsed page
      ret
    end

    # Endless worker loop: waits for the next page that is due, parses it and
    # stamps +parsed_at+ when the strategy block returned a truthy value.
    def start
      loop do
        sleep 1 until (page = find_next)
        page.touch(:parsed_at) if parse(page.content)
      end
    end

    private

    # Next crawled page that was never parsed or was last parsed more than a
    # day ago; never-parsed pages first, then oldest.
    #
    # @return [Page, nil] nil when nothing is due
    def find_next
      Page.where('crawled_at IS NOT NULL AND (parsed_at IS NULL OR parsed_at < ?)', 1.day.ago)
          # Arel.sql marks this constant fragment as trusted raw SQL; newer
          # ActiveRecord versions reject unmarked raw SQL in #order.
          .order(Arel.sql('parsed_at IS NOT NULL, parsed_at'))
          .first
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'tjcrawler/page'
2
module Tjcrawler
  # In-memory scheduler that feeds URLs to crawler threads through a Queue.
  #
  # Expects Tjcrawler::Crawler to be loaded already (for example through
  # `require 'tjcrawler'`).
  class Scheduler
    # Link selector the original implementation hard-coded.
    DEFAULT_CSS = '.entry-title > a'.freeze

    # @param seed_url [String] first URL to crawl
    # @param crawler_number [Integer] number of worker threads
    # @param css [String] CSS selector for the links each Crawler collects
    def initialize(seed_url: nil, crawler_number: 1, css: DEFAULT_CSS)
      @seed_url = seed_url
      @crawler_number = crawler_number
      @css = css
      @threads = []
    end

    # Starts the workers and waits for them. Each worker takes one URL off
    # the queue, crawls it and pushes the discovered links back.
    #
    # NOTE(review): a worker blocks in Queue#pop until a URL is available, so
    # the run cannot finish if fewer URLs are discovered than there are
    # workers waiting.
    #
    # @return [Array<Thread>] the joined worker threads
    # @raise [ArgumentError] if no seed URL was configured
    def start
      raise ArgumentError, 'seed_url is required' if @seed_url.nil?

      @queue = Queue.new
      @queue << @seed_url
      @threads = Array.new(@crawler_number) do
        Thread.new do
          crawler = Crawler.new(@css)
          result = crawler.crawl(@queue.pop)
          result.links.each { |url| @queue << url }
        end
      end
      @threads.each(&:join)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Tjcrawler
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,10 @@
1
+ require 'spec_helper'
2
+ require 'tjcrawler/crawler'
3
+
4
describe Tjcrawler::Crawler do
  # NOTE: this example performs a real HTTP request and asserts on the links
  # the target site served when the spec was written.
  it '#crawl' do
    # Crawler takes the CSS selector as its single positional argument,
    # #crawl takes only the URL, and the links are exposed as Result#links.
    crawler = Tjcrawler::Crawler.new('.entry-title > a')
    result = crawler.crawl('http://tonytonyjan.net/blog/')
    expect(result.links).to eql ["http://tonytonyjan.net/2015/01/03/2014-review/", "http://tonytonyjan.net/2014/12/20/intorduction-to-rails-4-dot-2/", "http://tonytonyjan.net/2014/10/10/rubykaigi-2014-feedback/", "http://tonytonyjan.net/2014/10/07/multi-line-c-macros/", "http://tonytonyjan.net/2014/06/28/fcfc-movie-downloader/", "http://tonytonyjan.net/2014/06/27/fb-page-countdown/", "http://tonytonyjan.net/2014/06/26/sfacg-comic-downloader/", "http://tonytonyjan.net/2014/06/20/swift-optional-chaining-on-ruby/", "http://tonytonyjan.net/2014/06/14/use-plurk-api-to-make-a-robot/", "http://tonytonyjan.net/2014/05/30/5xruby-start/"]
  end
end
@@ -0,0 +1,2 @@
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
+ require 'tjcrawler'
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+
3
+ describe Tjcrawler do
4
+ it 'has a version number' do
5
+ expect(Tjcrawler::VERSION).not_to be nil
6
+ end
7
+ end
data/tjcrawler.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tjcrawler/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "tjcrawler"
8
+ spec.version = Tjcrawler::VERSION
9
+ spec.authors = ["Jian Weihang"]
10
+ spec.email = ["tonytonyjan@gmail.com"]
11
+ spec.summary = %q{Simple crawling tool.}
12
+ spec.description = %q{Simple crawling tool.}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_dependency 'nokogiri'
22
+ spec.add_dependency 'activerecord'
23
+ spec.add_dependency 'activesupport'
24
+ spec.add_development_dependency "bundler", "~> 1.7"
25
+ spec.add_development_dependency "rake", "~> 10.0"
26
+ spec.add_development_dependency "rspec"
27
+ end
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tjcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jian Weihang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '10.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Simple crawling tool.
98
+ email:
99
+ - tonytonyjan@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".rspec"
106
+ - ".travis.yml"
107
+ - Gemfile
108
+ - LICENSE.txt
109
+ - README.md
110
+ - Rakefile
111
+ - examples/dsl.rb
112
+ - examples/dsl_parser.rb
113
+ - examples/fc2.rb
114
+ - examples/fc2_parse.rb
115
+ - examples/schema.rb
116
+ - lib/tjcrawler.rb
117
+ - lib/tjcrawler/config.rb
118
+ - lib/tjcrawler/crawler.rb
119
+ - lib/tjcrawler/crawler/result.rb
120
+ - lib/tjcrawler/page.rb
121
+ - lib/tjcrawler/parser.rb
122
+ - lib/tjcrawler/scheduler.rb
123
+ - lib/tjcrawler/version.rb
124
+ - spec/crawler_spec.rb
125
+ - spec/spec_helper.rb
126
+ - spec/tjcrawler_spec.rb
127
+ - tjcrawler.gemspec
128
+ homepage: ''
129
+ licenses:
130
+ - MIT
131
+ metadata: {}
132
+ post_install_message:
133
+ rdoc_options: []
134
+ require_paths:
135
+ - lib
136
+ required_ruby_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ required_rubygems_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ requirements: []
147
+ rubyforge_project:
148
+ rubygems_version: 2.4.5
149
+ signing_key:
150
+ specification_version: 4
151
+ summary: Simple crawling tool.
152
+ test_files:
153
+ - spec/crawler_spec.rb
154
+ - spec/spec_helper.rb
155
+ - spec/tjcrawler_spec.rb