tjcrawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d862225dcccd8804520d59ed284c9e536eaed0c8
4
+ data.tar.gz: 5ce47b642a5bae952139f8ad53adb00d4f697e49
5
+ SHA512:
6
+ metadata.gz: b376fdb73d63fe698ddd1dbb787ab3e5973a58fdc1e5c4514090c0785e3cf594dbbb3398d8b26dd555f255e899a70b1c3406aa741fe1af7273dae1913f8265e3
7
+ data.tar.gz: da0abd28dfb2245fd5b9a54d3e3094d4db1c8add13385eed00b13f15580d877a0ff22fd41e49c04e6c8361dead88f23edfd44d3c60e1592cad9bfb63f549c7ff
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.5
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tjcrawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Jian Weihang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Tjcrawler
2
+
3
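+ Tjcrawler is a simple crawling tool: crawler threads fetch pages and store them through ActiveRecord, and parser threads process the stored pages with a block you supply.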
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'tjcrawler'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install tjcrawler
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/tjcrawler/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Defines the `spec` task that runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` without arguments runs the specs.
task default: :spec
7
+
data/examples/dsl.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'tjcrawler'
2
+ require 'pg'
3
# Start five crawler threads against the 'fc2' PostgreSQL database,
# following the links matched by the CSS selector.
Tjcrawler.start_crawler do |crawler_config|
  crawler_config.threads      = 5
  crawler_config.css_selector = '.pager_box > a'
  crawler_config.db_setting   = { adapter: 'postgresql', database: 'fc2' }
end
@@ -0,0 +1,9 @@
1
+ require 'tjcrawler'
2
+ require 'pg'
3
# Start three parser threads against the 'fc2' PostgreSQL database.
# The callable receives each parsed document; returning true marks the
# page as parsed.
Tjcrawler.start_parser do |parser_config|
  parser_config.threads    = 3
  parser_config.db_setting = { adapter: 'postgresql', database: 'fc2' }
  parser_config.proc       = lambda { |_doc| true }
end
data/examples/fc2.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'tjcrawler'
2
+ require 'pg'
3
+
4
# Connect to the database that holds the pages table.
ActiveRecord::Base.establish_connection(adapter: 'postgresql', database: 'fc2')

# Seed the queue with the first listing page.
Tjcrawler::Page.enqueue 'http://video.fc2.com/list_scont/'

# Run five crawlers concurrently and wait for all of them.
workers = 5.times.map do
  Thread.new { Tjcrawler::Crawler.new('.pager_box > a').start }
end
workers.each(&:join)
@@ -0,0 +1,13 @@
1
+ require 'tjcrawler'
2
+ require 'pg'
3
+
4
# Connect to the database that holds the crawled pages.
ActiveRecord::Base.establish_connection(adapter: 'postgresql', database: 'fc2')

# The block receives each parsed document; returning true marks the page
# as parsed.
page_parser = Tjcrawler::Parser.new { |_doc| true }

page_parser.start
@@ -0,0 +1,15 @@
1
+ require 'active_record'
2
+ require 'pg'
3
+
4
# Connect to the target database.
ActiveRecord::Base.establish_connection(adapter: 'postgresql', database: 'fc2')

# (Re)create the pages table; `force: true` drops an existing table first.
ActiveRecord::Migration.create_table(:pages, force: true) do |table|
  table.string   :url
  table.text     :content
  table.datetime :crawled_at
  table.datetime :parsed_at
  table.index    :url, unique: true
end
data/lib/tjcrawler.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'tjcrawler/version'
2
+ require 'tjcrawler/crawler'
3
+ require 'tjcrawler/parser'
4
+ require 'tjcrawler/config'
5
+
6
# Entry points that configure the database connection and run crawler or
# parser workers in threads.
module Tjcrawler
  module_function

  # Yields a fresh Config to the caller's block, connects to the database
  # and runs `config.threads` crawlers, each following `config.css_selector`.
  #
  # @yieldparam config [Config] settings object to fill in
  # @return [Array<Thread>] the joined worker threads
  def start_crawler
    settings = Config.new
    yield settings
    set_db(settings)
    workers = settings.threads.times.map do
      Thread.new { Tjcrawler::Crawler.new(settings.css_selector).start }
    end
    workers.each(&:join)
  end

  # Yields a fresh Config to the caller's block, connects to the database
  # and runs `config.threads` parsers, each using `config.proc` as its
  # parsing block.
  #
  # @yieldparam config [Config] settings object to fill in
  # @return [Array<Thread>] the joined worker threads
  def start_parser
    settings = Config.new
    yield settings
    set_db(settings)
    workers = settings.threads.times.map do
      Thread.new { Tjcrawler::Parser.new(&settings.proc).start }
    end
    workers.each(&:join)
  end

  # Establishes the ActiveRecord connection described by `config.db_setting`.
  def set_db(config)
    ActiveRecord::Base.establish_connection(config.db_setting)
  end
end
@@ -0,0 +1,8 @@
1
module Tjcrawler
  # Plain settings holder with read/write accessors and a default of five
  # worker threads.
  class Config
    attr_accessor :db_setting
    attr_accessor :css_selector
    attr_accessor :threads
    attr_accessor :proc

    # Only `threads` has a default; every other setting starts out nil.
    def initialize
      self.threads = 5
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'tjcrawler/crawler/result'
4
+ require 'tjcrawler/page'
5
+
6
module Tjcrawler
  # Fetches pages and collects the outgoing links matched by a CSS selector.
  class Crawler
    # @param css_selector_for_link_tags [String] CSS selector matching the
    #   elements whose `href` should be followed
    def initialize(css_selector_for_link_tags)
      @css = css_selector_for_link_tags
    end

    # Downloads `url` and extracts the links matched by the selector.
    #
    # @param url [String] address of the page to fetch
    # @return [Result] normalized URL, raw body and absolute link URLs
    def crawl(url)
      uri = URI(url).tap(&:normalize!)
      # URI#read is provided by open-uri. Kernel#open stopped accepting
      # URIs in Ruby 3.0, so let the URI object fetch itself instead.
      content = uri.read
      doc = Nokogiri::HTML(content)
      Result.new url: uri.to_s, content: content, links: absolute_links(doc, uri)
    end

    # Runs forever: waits (polling once per second) until Page.dequeue
    # returns a page, crawls it, stores the content, and enqueues every
    # discovered link. An exception raised while crawling is not rescued
    # here and therefore ends the loop.
    def start
      loop do
        sleep 1 until (page = Page.dequeue)
        print '.'
        result = crawl(page.url)
        page.touch(:crawled_at) if page.update(content: result.content)
        result.links.each { |url| Page.enqueue(url) }
      end
    end

    private

    # Absolute URLs for every matched element that has a usable href.
    def absolute_links(doc, base)
      doc.css(@css).map { |link| resolve(base, link[:href]) }.compact
    end

    # Resolves `href` against `base`; returns nil when the element has no
    # href or the href cannot be parsed, so one bad link does not abort
    # the whole page.
    def resolve(base, href)
      return nil if href.nil?

      base.merge(href).to_s
    rescue URI::InvalidURIError
      nil
    end
  end
end
@@ -0,0 +1,12 @@
1
module Tjcrawler
  class Crawler
    # Holder for the outcome of one crawl: the page URL, its raw content
    # and the links found on it.
    class Result
      attr_accessor :url, :content, :links

      # Stores each non-nil keyword argument in the instance variable of
      # the same name; nil values are skipped entirely.
      def initialize(**params)
        present = params.reject { |_name, value| value.nil? }
        present.each_pair do |name, value|
          instance_variable_set("@#{name}", value)
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'active_record'
2
+ require 'active_support/core_ext/numeric/time'
3
+ require 'active_support/core_ext/date/calculations'
4
+
5
module Tjcrawler
  # ActiveRecord model that doubles as the crawl queue: a row per URL with
  # its content and the crawled_at / parsed_at timestamps.
  class Page < ActiveRecord::Base
    # Serializes enqueue calls made by threads of this process. A constant
    # is used instead of a class variable, which would be shared with (and
    # reassignable from) every subclass.
    ENQUEUE_LOCK = Mutex.new

    class << self
      # Ensures a row exists for `url`.
      #
      # @param url [String] address to add to the queue
      # @return [Page, nil] the existing or newly created record
      def enqueue(url)
        ENQUEUE_LOCK.synchronize { Page.find_or_create_by(url: url) }
      rescue ActiveRecord::RecordNotUnique
        # The mutex only covers this process; if another process inserted
        # the same URL first, a unique index on url rejects our insert.
        # Return the row that won instead of failing.
        Page.find_by(url: url)
      end

      # Next page to crawl: never-crawled pages first, then pages crawled
      # more than a day ago, oldest first.
      #
      # @return [Page, nil] nil when nothing is due
      def dequeue
        Page.where('crawled_at IS NULL OR crawled_at < ?', 1.day.ago)
            # Arel.sql marks the expression as intentional raw SQL; newer
            # ActiveRecord versions reject plain strings like this in order.
            .order(Arel.sql('crawled_at IS NOT NULL, crawled_at'))
            .first
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'nokogiri'
2
+ require 'tjcrawler/page'
3
+
4
module Tjcrawler
  # Feeds crawled pages to a user-supplied parsing block.
  class Parser
    # The block is called with a Nokogiri document and must return
    # true/false for successful/failed parsing.
    #
    # @raise [ArgumentError] when no block is given
    def initialize(&block)
      # The original yielded here, which can only fail with LocalJumpError
      # because this branch runs precisely when there is no block.
      raise ArgumentError, 'Block required' unless block_given?

      @strategy = block
    end

    # Parses `content` as HTML, runs the block on the document and prints
    # a progress dot.
    #
    # @param content [String] raw HTML
    # @return [Object] whatever the block returned
    def parse(content)
      doc = Nokogiri::HTML(content)
      outcome = @strategy.call(doc)
      print '.'
      outcome
    end

    # Runs forever: waits (polling once per second) for the next page and
    # stamps parsed_at when the block reports success.
    def start
      loop do
        sleep 1 until (page = find_next)
        page.touch(:parsed_at) if parse(page.content)
      end
    end

    private

    # Next crawled page that was never parsed, or was parsed more than a
    # day ago; never-parsed pages come first, then oldest parsed_at.
    def find_next
      Page.where('crawled_at IS NOT NULL AND (parsed_at IS NULL OR parsed_at < ?)', 1.day.ago)
          # Arel.sql marks the expression as intentional raw SQL; newer
          # ActiveRecord versions reject plain strings like this in order.
          .order(Arel.sql('parsed_at IS NOT NULL, parsed_at'))
          .first
    end
  end
end
@@ -0,0 +1,17 @@
1
require 'tjcrawler/crawler'
require 'tjcrawler/page'
2
module Tjcrawler
  # Runs several crawlers that share one in-memory URL queue.
  class Scheduler
    # All arguments are optional so that `Scheduler.new` keeps working.
    #
    # @param seed_url [String, nil] first URL put on the queue
    # @param crawler_number [Integer] number of worker threads
    # @param css [String] CSS selector handed to each Crawler
    def initialize(seed_url = nil, crawler_number: 5, css: '.entry-title > a')
      @seed_url = seed_url
      @crawler_number = crawler_number
      @css = css
      @threads = []
    end

    # Seeds the queue and starts the workers. Each worker takes ONE URL
    # from the queue, crawls it and pushes the links it found.
    #
    # NOTE(review): a worker blocks on Queue#pop until a URL is available,
    # so with more workers than available URLs the join below cannot
    # finish — confirm the intended termination behaviour.
    #
    # @raise [ArgumentError] when no seed URL was configured
    # @return [Array<Thread>] the joined worker threads
    def start
      raise ArgumentError, 'seed_url is required' if @seed_url.nil?

      @queue = Queue.new
      @queue << @seed_url
      @threads = @crawler_number.times.map do
        Thread.new do
          crawler = Crawler.new(@css)
          # Crawler#crawl returns a Result; the URLs live in #links.
          crawler.crawl(@queue.pop).links.each { |url| @queue << url }
        end
      end
      @threads.each(&:join)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Tjcrawler
  # Gem version string, read by tjcrawler.gemspec.
  VERSION = '0.0.1'
end
@@ -0,0 +1,10 @@
1
+ require 'spec_helper'
2
+ require 'tjcrawler/crawler'
3
+
4
# NOTE(review): this example fetches a live page, so it needs network
# access and depends on the blog's current content.
describe Tjcrawler::Crawler do
  it '#crawl' do
    # Crawler.new takes the CSS selector as a plain string, #crawl takes
    # only the URL, and the extracted URLs are exposed as Result#links.
    crawler = Tjcrawler::Crawler.new('.entry-title > a')
    result = crawler.crawl('http://tonytonyjan.net/blog/')
    expect(result.links).to eql [
      "http://tonytonyjan.net/2015/01/03/2014-review/",
      "http://tonytonyjan.net/2014/12/20/intorduction-to-rails-4-dot-2/",
      "http://tonytonyjan.net/2014/10/10/rubykaigi-2014-feedback/",
      "http://tonytonyjan.net/2014/10/07/multi-line-c-macros/",
      "http://tonytonyjan.net/2014/06/28/fcfc-movie-downloader/",
      "http://tonytonyjan.net/2014/06/27/fb-page-countdown/",
      "http://tonytonyjan.net/2014/06/26/sfacg-comic-downloader/",
      "http://tonytonyjan.net/2014/06/20/swift-optional-chaining-on-ruby/",
      "http://tonytonyjan.net/2014/06/14/use-plurk-api-to-make-a-robot/",
      "http://tonytonyjan.net/2014/05/30/5xruby-start/"
    ]
  end
end
@@ -0,0 +1,2 @@
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
+ require 'tjcrawler'
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+
3
# Sanity check that the gem exposes its version constant.
RSpec.describe Tjcrawler do
  it 'has a version number' do
    expect(Tjcrawler::VERSION).not_to be_nil
  end
end
data/tjcrawler.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tjcrawler/version'
5
+
6
# Gem metadata, packaged files and dependencies for tjcrawler.
Gem::Specification.new do |gem|
  gem.name        = 'tjcrawler'
  gem.version     = Tjcrawler::VERSION
  gem.authors     = ['Jian Weihang']
  gem.email       = ['tonytonyjan@gmail.com']
  gem.summary     = 'Simple crawling tool.'
  gem.description = 'Simple crawling tool.'
  gem.homepage    = ''
  gem.license     = 'MIT'

  # Package every file tracked by git; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies.
  gem.add_dependency 'nokogiri'
  gem.add_dependency 'activerecord'
  gem.add_dependency 'activesupport'

  # Development-only dependencies.
  gem.add_development_dependency 'bundler', '~> 1.7'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'rspec'
end
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tjcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jian Weihang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '10.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Simple crawling tool.
98
+ email:
99
+ - tonytonyjan@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".rspec"
106
+ - ".travis.yml"
107
+ - Gemfile
108
+ - LICENSE.txt
109
+ - README.md
110
+ - Rakefile
111
+ - examples/dsl.rb
112
+ - examples/dsl_parser.rb
113
+ - examples/fc2.rb
114
+ - examples/fc2_parse.rb
115
+ - examples/schema.rb
116
+ - lib/tjcrawler.rb
117
+ - lib/tjcrawler/config.rb
118
+ - lib/tjcrawler/crawler.rb
119
+ - lib/tjcrawler/crawler/result.rb
120
+ - lib/tjcrawler/page.rb
121
+ - lib/tjcrawler/parser.rb
122
+ - lib/tjcrawler/scheduler.rb
123
+ - lib/tjcrawler/version.rb
124
+ - spec/crawler_spec.rb
125
+ - spec/spec_helper.rb
126
+ - spec/tjcrawler_spec.rb
127
+ - tjcrawler.gemspec
128
+ homepage: ''
129
+ licenses:
130
+ - MIT
131
+ metadata: {}
132
+ post_install_message:
133
+ rdoc_options: []
134
+ require_paths:
135
+ - lib
136
+ required_ruby_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ required_rubygems_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ requirements: []
147
+ rubyforge_project:
148
+ rubygems_version: 2.4.5
149
+ signing_key:
150
+ specification_version: 4
151
+ summary: Simple crawling tool.
152
+ test_files:
153
+ - spec/crawler_spec.rb
154
+ - spec/spec_helper.rb
155
+ - spec/tjcrawler_spec.rb