mmonitor 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: aa406deb1d049e0bb84bdb8c4cd55c534af658fa
4
+ data.tar.gz: 058830a5dbe7e114996a3a4c46148769e318d409
5
+ SHA512:
6
+ metadata.gz: 1da90674d7fcf473f2aac1a0f29882b1315ae80e2d6642cdf1954ef4263d8bed22b22ad3d499ae88176c9b51d694ff1d966da969f9438d4f815241b01e9a0824
7
+ data.tar.gz: bd655bbb204e52493ccd90491f006eda03803104268fd6dc945f6080d5e2f98d7f7c37e96cb17e76a7203f855c4d471ea1a7d3dfb3b6a257e9082a5de9ca2266
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in mmonitor.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # MMonitor
2
+
3
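+ MMonitor crawls product listing pages of Chinese e-commerce sites (Tmall, Amazon.cn, JD, Jumei, Lefeng, Suning, Yhd and Yixun) and collects each product's ID, title, image and prices.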
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'mmonitor'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install mmonitor
18
+
19
+ ## Usage
20
+
21
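+ Create a `MMonitor::Crawler` for each listing URL and read its `items` hash (keyed by the product ID used on that site). The example below merges the results from several sites and exports them to a CSV file: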
22
+
23
+ ``` ruby
24
+ # -*- encoding: utf-8 -*-
25
+ require 'mmonitor'
26
+ items = {}
27
+ url = 'http://loreal.tmall.com/category.htm?search=y&scene=taobao_shop'
28
+ a = MMonitor::Crawler.new(url)
29
+ items.merge!(a.items)
30
+
31
+ url = 'http://www.amazon.cn/s/ref=sr_nr_p_89_0?rh=i%3Aaps%2Ck%3A%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85%2Cp_89%3AL%27Oreal+Paris+%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&keywords=%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&ie=UTF8&qid=1397212889&rnid=125596071'
32
+ b = MMonitor::Crawler.new(url)
33
+ items.merge!(b.items)
34
+
35
+ url = 'http://search.jd.com/search?keyword=%E7%BE%8E%E5%AE%9D%E8%8E%B2&enc=utf-8&qr=&qrst=UNEXPAND&et=&rt=1&stop=1&area=1&wtype=1&ev=exbrand_%E7%BE%8E%E5%AE%9D%E8%8E%B2%EF%BC%88MAYBELLINE%EF%BC%89%40&uc=0#select'
36
+ c = MMonitor::Crawler.new(url)
37
+ items.merge!(c.items)
38
+
39
+ url = 'http://search.jumei.com/?filter=0-11-1&search=%E6%AC%A7%E8%8E%B1%E9%9B%85&from=search_topbar_%E6%AC%A7%E8%8E%B1%E9%9B%85_word_pos1&cat=&bid=1'
40
+ d = MMonitor::Crawler.new(url)
41
+ items.merge!(d.items)
42
+
43
+ url = 'http://search.lefeng.com/search/search?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&wt.s_pg=Isearch&wt.s_pf=public'
44
+ e = MMonitor::Crawler.new(url)
45
+ items.merge!(e.items)
46
+
47
+ url = 'http://search.suning.com/%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85/cityId=9264&iy=-1&ct=1&si=5&st=0'
48
+ f = MMonitor::Crawler.new(url)
49
+ items.merge!(f.items)
50
+
51
+ url = 'http://www.yhd.com/ctg/s2/c0-0/b/a-s1-v0-p1-price-d0-f06-m1-rt0-pid-mid0-k%E6%AC%A7%E8%8E%B1%E9%9B%85/'
52
+ j = MMonitor::Crawler.new(url)
53
+ items.merge!(j.items)
54
+
55
+ url = 'http://searchex.yixun.com/html?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&area=1&sort=0&show=0&size=40&pf=1&as=1&charset=utf-8&YTAG=1.100000401#list'
56
+ h = MMonitor::Crawler.new(url)
57
+ items.merge!(h.items)
58
+
59
+ require 'csv'
60
+ header_row = ['平台ID', 'SKU ID','商品名称', '平台', '原价', '销售价', '商品图片', '价格图片', '抓取日期']
61
+ CSV.open('测试数据.csv', "wb:GB18030", col_sep: ',') do |csv|
62
+ csv << header_row
63
+ items.each do |item_id, item|
64
+ csv << [
65
+ item_id,
66
+ item[:sku_id],
67
+ item[:title],
68
+ item[:provider],
69
+ item[:tag_price],
70
+ item[:price],
71
+ item[:photo_url],
72
+ item[:price_url],
73
+ Time.now.to_date.to_s
74
+ ]
75
+ end
76
+ end
77
+ ```
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/lib/mmonitor.rb ADDED
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'active_support/core_ext'
3
+
4
# Top-level namespace of the mmonitor gem.
module MMonitor
  # NOTE: mmonitor.gemspec extracts this value with a regex, so keep it a
  # plain quoted literal on one line.
  VERSION = '0.0.2'

  # Parsing rules currently supported. Every strategy constant is autoloaded
  # from mmonitor/strategies/<lower-cased constant name>.
  module Strategies
    %w[Amazon Jd Jumei Lefeng Suning Tmall Yhd Yixun].each do |site|
      autoload site.to_sym, "mmonitor/strategies/#{site.downcase}"
    end
  end

  # Core components, loaded lazily on first reference.
  [
    [:Crawler, 'mmonitor/crawler'],
    [:Parser,  'mmonitor/parser'],
    [:Spider,  'mmonitor/spider']
  ].each do |const_name, feature|
    autoload const_name, feature
  end
end
@@ -0,0 +1,83 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'addressable/uri'
3
+
4
module MMonitor
  # Crawler: drives the fetch flow for one listing URL. It picks the parsing
  # strategy that matches the URL's host, lets it parse the first page, then
  # walks the remaining pages and finally runs the strategy's `extra` hook.
  class Crawler

    attr_accessor :adapter, :url, :params, :pages, :page, :page_key, :total

    # Crawls every page reachable from +url+ right away.
    #
    # @param url [String] listing/search URL of a supported shop
    def initialize(url)
      self.page = 1
      self.page_key = 'page'
      process(url)
    end

    # @return [Hash] products collected by the strategy, keyed by product id;
    #   empty when the host is not supported
    def items
      adapter.nil? ? {} : adapter.items
    end

    private

    # Normalizes the URL (scheme + host + path, query kept separately in
    # +params+), builds the strategy and runs the pagination.
    def process(uri)
      uri = Addressable::URI.parse(uri)
      host = uri.host
      self.url = "#{uri.scheme}://#{host}#{uri.path}"
      self.params = uri.query_values || {}

      self.adapter = build_adapter(host.to_s)
      # Unsupported host: nothing to paginate. Previously this fell through
      # and raised NoMethodError on nil.
      return nil if adapter.nil?

      self.pages = adapter.pages
      self.total = adapter.total
      self.page_key = adapter.page_key
      next_page
    end

    # Picks the strategy whose domain occurs in +host+ and feeds it the first
    # page. Prints the host and returns nil when no strategy matches.
    def build_adapter(host)
      case
      when host.include?('amazon.cn')  then Strategies::Amazon.new(html)
      when host.include?('jd.com')     then Strategies::Jd.new(html)
      when host.include?('jumei.com')  then Strategies::Jumei.new(html)
      when host.include?('lefeng.com') then Strategies::Lefeng.new(html)
      when host.include?('suning.com') then Strategies::Suning.new(html)
      when host.include?('tmall.com')  then Strategies::Tmall.new(html)
      when host.include?('yhd.com')    then Strategies::Yhd.new(html, url)
      when host.include?('yixun.com')  then Strategies::Yixun.new(html)
      else
        puts host
        puts '_' * 88
        nil
      end
    end

    # Fetches and parses the remaining pages one by one (iteratively, so the
    # stack depth does not grow with the page count), then lets the strategy
    # finish up and drops its references to the parsed document.
    def next_page
      loop do
        puts "分页提示:#{page}/#{pages}"
        puts '_' * 88
        break unless pages > page

        self.page += 1
        # The strategy decides how the page number is encoded in the query.
        params[page_key] = adapter.page(page)
        adapter.body = html
        adapter.process
      end

      adapter.extra
      adapter.body = nil
      adapter.item = nil
      puts "产品差异:#{items.count}/#{total}" unless total.nil?
      nil
    end

    # Fetches the current URL/params. Before a strategy exists the generic
    # Spider is used; afterwards the strategy's own fetcher is used.
    def html
      if adapter.nil?
        Spider.get_html(url, params)
      else
        adapter.get_html(url, params)
      end
    end
  end
end
@@ -0,0 +1,133 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor

  # Parser: base class for page-template parsing. A strategy subclass supplies
  # `provider` and `css_path` (a Hash of CSS selectors / attribute names) and
  # may override any extractor below. Parsed products accumulate in `items`,
  # keyed by product id.
  class Parser

    attr_accessor :body, :items, :item

    # @param dom [#at, nil] parsed document. With nil, or when the body
    #   selector matches nothing, the parser simply stays empty.
    def initialize(dom)
      # Set up the result hash first so `items` is never nil for callers.
      self.items ||= {}
      return if dom.nil?

      self.body = dom.at(css_path[:body])
      process unless body.nil?
    end

    # Extracts every product of the current `body` into `items`.
    # List entries without a product id are skipped.
    def process
      list.each do |node|
        self.item = node
        id = spu_id
        next if id.nil?

        items[id] = {
          provider:  provider,
          title:     title,
          sku_id:    sku_id,
          photo_url: photo_url,
          tag_price: tag_price,
          price:     price,
          price_url: price_url
        }
      end
    end

    # Product nodes of the current page.
    # `css_path[:list]` may be one selector or an Array of selectors;
    # containers that are not present on the page are skipped in both cases.
    #
    # @return [Array]
    def list
      products = []
      Array(css_path[:list]).each do |selector|
        container = body.at(selector)
        products.concat(container.css(css_path[:item]).to_a) unless container.nil?
      end
      products
    end

    # Hook for extra work after the last page was parsed (no-op by default).
    def extra
    end

    # @return [Integer] number of result pages; 0 when the pager is missing
    def pages
      pager = body && body.at(css_path[:pages])
      pager ? pager.text.to_i : 0
    end

    # @return [String, nil] total product count as shown on the page, or nil
    #   when the strategy has no :total selector or the node is missing
    def total
      return nil unless css_path.has_key?(:total)

      counter = body && body.at(css_path[:total])
      squish(counter && counter.text)
    end

    # Name of the query parameter that carries the page number.
    def page_key
      'page'
    end

    # Value to send for page +num+ (identity by default).
    def page(num)
      num
    end

    # Fetches a page; strategies may override to build the URL differently.
    def get_html(url, params)
      Spider.get_html(url, params)
    end

    # Product id, read from an attribute of the item node.
    def spu_id
      squish(item[css_path[:spu_id]])
    end

    # Single-item (SKU) id, read from an attribute of the item node (optional).
    def sku_id
      css_path.has_key?(:sku_id) ? squish(item[css_path[:sku_id]]) : nil
    end

    # Product title.
    def title
      text_at(:title)
    end

    # Product image URL; nil when the image node is missing.
    def photo_url
      attr_at(:photo_url, 'src')
    end

    # Selling price. 0.0 when the selector matches nothing, nil when the
    # strategy defines no :price selector.
    def price
      css_path.has_key?(:price) ? text_at(:price).to_f : nil
    end

    # Original (list) price (optional); same conventions as `price`.
    def tag_price
      css_path.has_key?(:tag_price) ? text_at(:tag_price).to_f : nil
    end

    # URL of the image that renders the selling price (optional).
    def price_url
      css_path.has_key?(:price_url) ? attr_at(:price_url, 'src') : nil
    end

    private

    # Cleaned text of the node matched by css_path[key] inside the current
    # item, or nil when nothing matches.
    def text_at(key)
      node = item.at(css_path[key])
      squish(node && node.text)
    end

    # Attribute of the node matched by css_path[key] inside the current item,
    # or nil when nothing matches.
    def attr_at(key, attribute)
      node = item.at(css_path[key])
      node && node[attribute]
    end

    # Text clean-up: drops yen signs (half- and full-width) and ALL
    # whitespace. Returns nil for nil/non-string input or undecodable text.
    def squish(string)
      return nil unless string.respond_to?(:gsub)

      string.gsub(/[¥￥]/, '').strip.gsub(/\s+/, '')
    rescue ArgumentError, Encoding::CompatibilityError
      nil
    end

    # Reads a price from a price image via OCR (used for out-of-stock items).
    # The misspelled name is kept because strategies call/override it.
    #
    # @return [Float, nil] first decimal number recognized, nil if none
    def pirce_from_ocr(price_url)
      str = MMonitor::Spider.get_ocr(price_url)
      matched = str.to_s.match(/\d+\.\d+/)
      matched ? matched[0].to_f : nil
    end
  end

end
@@ -0,0 +1,80 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'oj'
3
+ require 'faraday'
4
+ require 'nokogiri'
5
+ require 'rtesseract'
6
+ require 'mini_magick'
7
+
8
module MMonitor
  # Spider: performs the HTTP requests (HTML, JSON, price images).
  module Spider
    # Upper bound on redirects followed for one request, so a redirect loop
    # cannot recurse forever.
    MAX_REDIRECTS = 5

    # Status codes treated as "follow the Location header".
    REDIRECT_STATUSES = [301, 302, 303, 307, 308].freeze

    # Browser user-agent strings; one is picked at random per request.
    USER_AGENTS = [
      'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; 360se)',
      'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; SE 2.X MetaSr 1.0)',
      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.802.30 Safari/535.1 SE 2.X MetaSr 1.0',
      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.3 (KHTML, like Gecko) Maxthon/3.3.2.1000 Chrome/16.0.883.0 Safari/535.3',
    ].freeze

    class << self
      # Fetches +url+ and parses the response as HTML.
      #
      # @return [Nokogiri::HTML::Document]
      def get_html(url, params={})
        body = get(url, params)
        # NOTE(review): this global looks like a debugging aid; it is kept
        # because code outside this file may read it — confirm and remove.
        $body = body
        ::Nokogiri::HTML(body)
      end

      # Fetches +url+ and parses the response as JSON.
      #
      # @return [Object] parsed JSON, or {} when the body is missing/invalid
      def get_json(url, params={})
        body = get(url, params)
        begin
          ::Oj.load(body)
        rescue StandardError
          # Only parse failures are swallowed; request errors still propagate.
          {}
        end
      end

      # Recognizes the text on the image at +photo_url+ via OCR.
      #
      # @return [String]
      def get_ocr(photo_url)
        image = MiniMagick::Image.open(photo_url)
        begin
          # Flatten onto white and convert to grayscale before recognition.
          image.combine_options do |c|
            c.background '#FFFFFF'
            c.colorspace 'GRAY'
            c.alpha 'remove'
          end
          image.format 'jpg'
          RTesseract.new(image.path, processor: 'mini_magick').to_s
        ensure
          # Always remove the temporary file, even when OCR raises.
          image.destroy!
        end
      end

      # Number of pages needed for +total+ entries at +limit+ per page
      # (integer ceiling division).
      def number_page(total, limit)
        count = total / limit
        count += 1 if total % limit > 0
        count
      end

      private

      # GETs +url+ and returns the body for 200, follows redirects (at most
      # +redirects_left+ more hops) and returns nil for anything else.
      def get(url, params={}, redirects_left=MAX_REDIRECTS)
        resp = conn.get url, params
        case resp.status
        when 200
          resp.body
        when *REDIRECT_STATUSES
          location = resp.headers['location']
          return nil if location.nil? || redirects_left <= 0

          get(location, {}, redirects_left - 1)
        else
          nil
        end
      end

      # Shared connection; the user agent is re-rolled on every call.
      def conn
        @conn ||= Faraday.new(ssl: false)
        @conn.headers[:user_agent] = switcher
        @conn.headers[:accept] = 'text/html,application/json;q=0.9'
        @conn
      end

      # Picks a random user agent to disguise the crawler.
      def switcher
        USER_AGENTS.sample
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Amazon.cn search result pages, e.g.
    # http://www.amazon.cn/s/ref=sr_nr_p_89_0?rh=i%3Aaps%2Ck%3A%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85%2Cp_89%3AL%27Oreal+Paris+%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&keywords=%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&ie=UTF8&qid=1397212889&rnid=125596071
    class Amazon < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :amazon
      end

      # Selectors consumed by the Parser base class. :spu_id names an
      # attribute of the item node; all other values are CSS selectors.
      def css_path
        {
          # page structure
          body:      'div#rightResultsATF',
          list:      ['div#atfResults', 'div#btfResults'],
          item:      'div.prod.celwidget',
          total:     '#resultCount > span',
          pages:     '#pagn > span.pagnDisabled',
          # per-product fields
          spu_id:    'name',
          title:     'h3.newaps>a',
          photo_url: 'div.image.imageContainer > a > div > img',
          tag_price: 'ul.rsltGridList > li.newp > div > a > del',
          price:     'ul.rsltGridList > li.newp > div > a > span'
        }
      end

    end
  end
end
@@ -0,0 +1,72 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for JD search result pages, e.g.
    # http://search.jd.com/search?keyword=%B0%CD%C0%E8%C5%B7%C0%B3%D1%C5&qr=&qrst=UNEXPAND&et=&rt=1&stop=1&ev=exbrand_%C5%B7%C0%B3%D1%C5%A3%A8LOREAL%A3%A9%40&area=1&wtype=1
    # The listing carries no prices; they are fetched in bulk by `extra`.
    class Jd < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :jd
      end

      # Selectors consumed by the Parser base class. :spu_id names an
      # attribute of the item node; the rest are CSS selectors.
      def css_path
        {
          body:      'body>div.w.main>div.right-extra',
          list:      'div#plist>ul.list-h',
          item:      'li',
          total:     'div#filter>div.fore1>div.total>span>strong',
          pages:     'div#filter>div.fore1>div#top_pagi>span',
          spu_id:    'sku',
          title:     'div>div.p-name>a',
          photo_url: 'div>div.p-img >a>img'
        }
      end

      # Product id; the 'scroll_loading' placeholder entry yields nil so the
      # base class skips it.
      def spu_id
        candidate = squish(item[css_path[:spu_id]])
        return nil if candidate == 'scroll_loading'

        candidate
      end

      # Product image URL, read from the lazy-load attribute.
      def photo_url
        image = item.at(css_path[:photo_url])
        image['data-lazyload']
      end

      # Page count: the part after the slash in the pager text.
      def pages
        pager_text = body.at(css_path[:pages]).text
        pager_text.split('/')[1].to_i
      end

      # After the last page: fetch prices for every collected product.
      def extra
        get_prices(items.keys)
      end

      private

      # Writes fetched prices into `items`. Entries that belong to no
      # collected product, or whose 'm' value is not positive, are printed
      # instead.
      def set_prices(prices)
        prices.each do |quote|
          product_id = quote['id'].gsub('J_', '')
          unless items.has_key?(product_id) && quote['m'].to_i > 0
            puts quote['id']
            puts '_' * 88
            next
          end

          items[product_id][:tag_price] = quote['m'].to_f
          items[product_id][:price]     = quote['p'].to_f
        end
      end

      # Requests prices from the price service in batches of 100 ids, each id
      # prefixed with "J_".
      def get_prices(sku_ids)
        endpoint = "http://p.3.cn/prices/mgets"
        prefixed = sku_ids.map { |sku_id| "J_#{sku_id}" }
        prefixed.each_slice(100) do |batch|
          quotes = MMonitor::Spider.get_json(endpoint, { skuids: batch.join(',') })
          set_prices(quotes) unless quotes.empty?
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Jumei search result pages, e.g.
    # http://search.jumei.com/?filter=0-11-1&search=%E6%AC%A7%E8%8E%B1%E9%9B%85&from=search_topbar_%E6%AC%A7%E8%8E%B1%E9%9B%85_word_pos1&cat=&bid=1
    class Jumei < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :jumei
      end

      # Selectors consumed by the Parser base class. :spu_id names an
      # attribute of the item node; the rest are CSS selectors.
      def css_path
        {
          body:      'body>div#container>div#body>div#search_result_wrap>div#search_list_wrap',
          list:      'div.products_wrap>ul',
          item:      'li',
          total:     'div.search_list_head_fiex>div>div.head_pagecount>span',
          pages:     'div.search_list_head_fiex>div>div.head_pageInfo',
          spu_id:    'pid',
          title:     'div>div.s_l_name>a',
          price:     'div > div.search_list_price > span',
          tag_price: 'div > div.search_list_price > del',
          photo_url: 'div>div.s_l_pic>a>img'
        }
      end

      # The page number travels inside the `filter` query parameter.
      def page_key
        'filter'
      end

      # Page count: the part after the slash in the pager text.
      def pages
        pager_text = body.at(css_path[:pages]).text
        pager_text.split('/')[1].to_i
      end

      # `filter` value that selects page +num+.
      def page(num)
        "0-11-#{num}"
      end

      # Product image URL, read from the lazy-load attribute.
      def photo_url
        image = item.at(css_path[:photo_url])
        image['original']
      end

    end
  end
end
@@ -0,0 +1,82 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Lefeng search result pages, e.g.
    # http://search.lefeng.com/search/search?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&wt.s_pg=Isearch&wt.s_pf=public
    # Prices are rendered as images, so they are taken from the cart service
    # when possible and from OCR on the price image otherwise.
    class Lefeng < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :lefeng
      end

      # Selectors consumed by the Parser base class. :spu_id and :sku_id name
      # attributes; the rest are CSS selectors.
      def css_path
        {
          body: 'body>div.mainBg>div.main>div.cont>div#clothList',
          list: 'div.smPruArea>div.makeup',
          item: 'div.pruwrap',
          total: 'div#sm-nav > span.tpages > span.tpageNum > i',
          pages: 'div#sm-nav > span.tpages > span.tpageNum > em',
          spu_id: 'id',
          sku_id: 'skuids',
          title: 'dl>dd.nam>a',
          photo_url: 'dl>dt>a>img',
          tag_price: 'dl>dd.pri>del',
          price_url: 'dl>dd.pri>img',
        }
      end

      # Page count: the part after the slash in the pager text.
      def pages
        self.body.at(css_path[:pages]).text.split('/')[1].to_i
      end

      # Name of the query parameter that carries the page number.
      def page_key
        'pageNo'
      end

      # Selling price: cart service first, OCR on the price image as fallback.
      def price
        price_from_cart(spu_id, sku_id) || pirce_from_ocr(price_url)
      end

      # SKU id, read from the item's <dl> element.
      def sku_id
        self.item.at('dl')[css_path[:sku_id]]
      end

      # Product image URL, read from the lazy-load attribute.
      def photo_url
        self.item.at(css_path[:photo_url])['pagespeed_lazy_src']
      end

      # URL of the price image with the '_73_' segment swapped for '_75_'.
      # NOTE(review): presumably selects a larger rendering — confirm.
      #
      # @return [String, nil] nil when the image or its attribute is missing
      def price_url
        image = self.item.at(css_path[:price_url])
        source = image && image['pagespeed_lazy_src']
        source && source.gsub('_73_', '_75_')
      end

      private

      # Asks the cart service for the price by requesting an "add" of one unit.
      #
      # @return [Float, nil] the reported total price, or nil when the service
      #   reports an error or returns no numeric price
      def price_from_cart(spu_id, sku_id)
        url = "http://shopping.lefeng.com/cart/ajaxUpsellcart.jsp"
        params = {
          'productId' => spu_id,
          'skuId' => sku_id,
          'quantity' => 1,
          'itemType' => 0,
          'process' => 'add',
          'skuExtraPriceId' => 0,
          'fromSite' => 'webAllProvince',
          # Cache-buster: current time in milliseconds (was a stale constant).
          '_' => (Time.now.to_f * 1000).to_i
        }
        json = MMonitor::Spider.get_json(url, params)
        if json.is_a?(Hash) && json["errorInfo"].blank?
          total = json['totalPrice']
          # Accept integer prices too; a whole-number price is still a price.
          return total.to_f if total.is_a?(Numeric)
        end
        puts '将通过图片识别价格,很耗资源呢~'
        puts '_'*88
        nil
      end

      # Reads the price from the price image via OCR (out-of-stock products).
      # The misspelled name is kept: it overrides the base-class method.
      #
      # @return [Float, nil] first decimal number recognized; nil when there
      #   is no image URL or no number was recognized
      def pirce_from_ocr(price_url)
        return nil if price_url.nil?

        # Non-mutating gsub: the caller's string is left untouched.
        str = MMonitor::Spider.get_ocr(price_url.gsub('_73_', '_75_'))
        matched = str.to_s.match( /\d+\.\d+/ )
        matched ? matched[0].to_f : nil
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Suning search result pages, e.g.
    # http://search.suning.com/%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85/cityId=9264&iy=-1&ct=1&si=5&st=0
    class Suning < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :suning
      end

      # Selectors consumed by the Parser base class. :spu_id and :sku_id name
      # attributes of the item node; the rest are CSS selectors.
      def css_path
        {
          body: 'body>div.wrap-search>div.w1190>div.proList.mb10',
          list: 'div#productTab>div#proShow>ul.container',
          item: 'li.item',
          total: 'div#filterContent>div.searchKeyT>i',
          pages: '#refresh>div.page>span>i#pageTotal',
          spu_id: 'name',
          sku_id: 'class',
          title: 'div > span > a > p',
          photo_url: 'a > img',
          price_url: 'div > p > img'
        }
      end

      # Product id: the item's `name` attribute as an Integer.
      def spu_id
        self.item[css_path[:spu_id]].to_i
      end

      # SKU id: the first token of the item's `class` attribute.
      def sku_id
        self.item[css_path[:sku_id]].split(' ')[0]
      end

      # URL of the price image (optional); nil when the item has none.
      def price_url
        dom = self.item.at(css_path[:price_url])
        dom['src2'] unless dom.nil?
      end

      # Selling price: browse-history service first, OCR on the price image
      # as fallback.
      #
      # @return [Float, nil] nil when neither source yields a price
      def price
        from_service = price_from_ajax(sku_id)
        return from_service if from_service

        image_url = price_url
        # Without a price image there is nothing to run OCR on.
        image_url.nil? ? nil : pirce_from_ocr(image_url)
      end

      # Name of the parameter that carries the page number.
      def page_key
        'cp'
      end

      # The site counts pages from zero.
      def page(num)
        num-1
      end

      # Appends the parameters to the URL with '&' (the site's URLs embed
      # their parameters in the path, as in the example above) and fetches it.
      def get_html(url, params)
        uri = Addressable::URI.new
        uri.query_values = params
        Spider.get_html("#{url}&#{uri.query}")
      end

      private

      # Looks the price up through the browse-history endpoint.
      #
      # @return [Float, nil] nil when the response is not a Hash with 'price'
      def price_from_ajax(sku_id)
        url = "http://product.suning.com/0000000000/browseHistory_10052_10051_#{sku_id}_9264_.html"
        json = MMonitor::Spider.get_json(url)
        return json['price'].to_f if json.is_a?(Hash) && json.has_key?('price')
        nil
      end

    end
  end
end
@@ -0,0 +1,39 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Tmall shop category pages, e.g.
    # http://loreal.tmall.com/category.htm?search=y&scene=taobao_shop
    class Tmall < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :tmall
      end

      # Selectors consumed by the Parser base class. :spu_id names an
      # attribute of the item node; the rest are CSS selectors.
      def css_path
        {
          body:      '#TmshopSrchNav>div',
          list:      'div.J_TItems',
          item:      'dl.item',
          pages:     'div.filter.clearfix.J_TFilter > p > b.ui-page-s-len', # text is split on '/' by `pages`
          spu_id:    'data-id',
          title:     'dd.detail > a',
          photo_url: 'dt > a:nth-child(1) > img', # URL is read from 'data-ks-lazyload'
          price:     'dd.detail > div > div.cprice-area > span.c-price'
        }
      end

      # Product image URL, read from the lazy-load attribute.
      def photo_url
        image = item.at(css_path[:photo_url])
        image['data-ks-lazyload']
      end

      # Page count: the part after the slash in the pager text.
      def pages
        pager_text = body.at(css_path[:pages]).text
        pager_text.split('/')[1].to_i
      end

      # Name of the query parameter that carries the page number.
      def page_key
        'pageNo'
      end

    end
  end
end
@@ -0,0 +1,94 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Yhd search result pages, e.g.
    # http://www.yhd.com/ctg/s2/c0-0/b/a-s1-v0-p1-price-d0-f06-m1-rt0-pid-mid0-k%E6%AC%A7%E8%8E%B1%E9%9B%85/
    # This strategy paginates on its own inside the constructor (through the
    # site's JSON endpoint) and reports a single page to the Crawler.
    class Yhd < MMonitor::Parser

      attr_accessor :body, :items, :item, :url, :pages_count, :page

      # Parses the first page, loads its "more products" part and then walks
      # all remaining pages.
      #
      # @param dom [#at] parsed first page
      # @param url [String] listing URL; 's2' is replaced by 'searchPage' to
      #   address the JSON endpoint
      def initialize(dom, url)
        self.body = dom.at(css_path[:body])
        self.items ||= {}
        self.url = url.sub('s2', 'searchPage')
        pager_text = body.at(css_path[:pages]).text
        self.pages_count = pager_text.split('/')[1].to_i
        self.page = 1
        process
        get_ajax(true)
        next_page
      end

      # Provider tag stored with every product.
      def provider
        :yhd
      end

      # Selectors consumed by the Parser base class. :spu_id names an
      # attribute (read from the title link); the rest are CSS selectors.
      def css_path
        {
          body:      '#plist',
          list:      '#itemSearchList',
          item:      'li.search_item > div.search_item_box',
          pages:     '#rankOpDiv > div > div.select_page_num', # text is split on '/' in the constructor
          spu_id:    'pmid',
          title:     'p.title > a',
          photo_url: 'a.search_prod_img > img', # URL is read from 'original', falling back to 'src'
          price:     'div.pricebox.clearfix > span:nth-child(1)'
        }
      end

      # Product id, read from the `pmid` attribute of the title link.
      def spu_id
        title_link = item.at(css_path[:title])
        title_link[css_path[:spu_id]]
      end

      # Product image URL: lazy-load attribute first, plain `src` otherwise.
      def photo_url
        image = item.at(css_path[:photo_url])
        image['original'] || image['src']
      end

      # Pagination disguise: always report one page so the Crawler does not
      # paginate a second time.
      def pages
        1
      end

      private

      # Walks the remaining pages: rewrites the "-pN-" segment of the URL,
      # then loads the page and its "more products" part. Releases the parsed
      # nodes when done.
      def next_page
        loop do
          puts "分页提示:#{page}/#{pages_count}"
          puts '_' * 88
          break unless pages_count > page

          self.page += 1
          url.gsub!("-p#{page - 1}-", "-p#{page}-")
          get_ajax # main part of the page
          get_ajax(true)
        end

        self.body = nil
        self.item = nil
        nil
      end

      # Extra query parameters that request the "more products" part.
      def more_params
        {
          'isGetMoreProducts' => 1,
          'moreProductsDefaultTemplate' => 0
        }
      end

      # Loads a page fragment from the JSON endpoint and parses it.
      # Does nothing when the response has no usable 'value'.
      def get_ajax(more=false)
        query = more ? more_params : {}
        json = MMonitor::Spider.get_json(url, query)
        return nil unless json.has_key?('value')

        fragment = json['value']
        return nil if fragment.nil?

        # Wrap the fragment so the :list selector matches it.
        self.body = ::Nokogiri::HTML("<div id='itemSearchList'>#{fragment}</div>")
        process
      end

    end
  end
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
module MMonitor
  module Strategies
    # Parsing rules for Yixun search result pages, e.g.
    # http://searchex.yixun.com/html?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&area=1&sort=0&show=0&size=40&pf=1&as=1&charset=utf-8&YTAG=1.100000401#list
    class Yixun < MMonitor::Parser

      # Provider tag stored with every product.
      def provider
        :yixun
      end

      # Selectors consumed by the Parser base class. :spu_id names an
      # attribute of the item node; the rest are CSS selectors.
      def css_path
        {
          body:      'div#container > div > div.grid_m.smain > div.grid_m_inner',
          list:      'div.goods > ul#itemList',
          item:      'li.goods_li',
          total:     '#list > div.sort_page > div.sort_page_txt > b',
          pages:     '#list > div.sort_page > div.sort_page_num > span', # text is split on '/' by `pages`
          spu_id:    'commid',
          title:     'div > div.mod_goods_info > p.mod_goods_tit > a',
          photo_url: 'div > div.mod_goods_img > a > img', # URL is read from 'init_src'
          price:     'div > div.mod_goods_info > p.mod_goods_price > span.mod_price > span'
        }
      end

      # Product image URL, read from the lazy-load attribute.
      def photo_url
        image = item.at(css_path[:photo_url])
        image['init_src']
      end

      # Page count: the part after the slash in the pager text.
      def pages
        pager_text = body.at(css_path[:pages]).text
        pager_text.split('/')[1].to_i
      end

    end
  end
end
data/mmonitor.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # -*- encoding: utf-8 -*-
2
lib = "mmonitor"
lib_file = File.expand_path("../lib/#{lib}.rb", __FILE__)
# Read the version straight out of the library source, without loading the
# library (and its dependencies) at build time.
version = File.read(lib_file)[/\bVERSION\s*=\s*["'](.+?)["']/, 1]
raise "VERSION constant not found in #{lib_file}" if version.nil?

Gem::Specification.new do |spec|
  spec.name          = lib
  spec.version       = version
  spec.authors       = ["Xu Fei", "Howl王"]
  spec.email         = ["xfstart07@gmail.com", "howl.wong@gmail.com"]
  spec.summary       = %q{Crawl product listings and prices from Chinese e-commerce sites.}
  spec.description   = %q{MMonitor crawls listing pages of Amazon.cn, JD, Jumei, Lefeng, Suning, Tmall, Yhd and Yixun and extracts product ids, titles, images and prices.}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Packaged files: the project basics, this gemspec and every library source.
  spec.files = %w(Gemfile README.md Rakefile)
  spec.files << "#{lib}.gemspec"
  spec.files += Dir.glob("lib/**/*.rb")

  spec.add_runtime_dependency 'oj', '~> 2.8', '>= 2.8.1'
  spec.add_runtime_dependency 'faraday', '~> 0.9', '>= 0.9.0'
  spec.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.1'
  spec.add_runtime_dependency 'rtesseract', '~> 1.2', '>= 1.2.1'
  spec.add_runtime_dependency 'mini_magick', '~> 3.7', '>= 3.7.0'
  spec.add_runtime_dependency 'addressable', '~> 2.3', '>= 2.3.6'
  spec.add_runtime_dependency 'activesupport', '~> 4.1', '>= 4.1.0'

  spec.add_development_dependency 'pry', '~> 0.9', '>= 0.9.12.6'
  spec.add_development_dependency 'awesome_print', '~> 1.2', '>= 1.2.0'
end
metadata ADDED
@@ -0,0 +1,242 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mmonitor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Xu Fei
8
+ - Howl王
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-08-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: oj
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '2.8'
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.8.1
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: !ruby/object:Gem::Requirement
27
+ requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '2.8'
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 2.8.1
34
+ - !ruby/object:Gem::Dependency
35
+ name: faraday
36
+ requirement: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.9'
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.9.0
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - "~>"
49
+ - !ruby/object:Gem::Version
50
+ version: '0.9'
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.9.0
54
+ - !ruby/object:Gem::Dependency
55
+ name: nokogiri
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.6'
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: 1.6.1
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - "~>"
69
+ - !ruby/object:Gem::Version
70
+ version: '1.6'
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: 1.6.1
74
+ - !ruby/object:Gem::Dependency
75
+ name: rtesseract
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - "~>"
79
+ - !ruby/object:Gem::Version
80
+ version: '1.2'
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: 1.2.1
84
+ type: :runtime
85
+ prerelease: false
86
+ version_requirements: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '1.2'
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: 1.2.1
94
+ - !ruby/object:Gem::Dependency
95
+ name: mini_magick
96
+ requirement: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - "~>"
99
+ - !ruby/object:Gem::Version
100
+ version: '3.7'
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: 3.7.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.7'
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: 3.7.0
114
+ - !ruby/object:Gem::Dependency
115
+ name: addressable
116
+ requirement: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - "~>"
119
+ - !ruby/object:Gem::Version
120
+ version: '2.3'
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: 2.3.6
124
+ type: :runtime
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '2.3'
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: 2.3.6
134
+ - !ruby/object:Gem::Dependency
135
+ name: activesupport
136
+ requirement: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - "~>"
139
+ - !ruby/object:Gem::Version
140
+ version: '4.1'
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: 4.1.0
144
+ type: :runtime
145
+ prerelease: false
146
+ version_requirements: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - "~>"
149
+ - !ruby/object:Gem::Version
150
+ version: '4.1'
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ version: 4.1.0
154
+ - !ruby/object:Gem::Dependency
155
+ name: pry
156
+ requirement: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - "~>"
159
+ - !ruby/object:Gem::Version
160
+ version: '0.9'
161
+ - - ">="
162
+ - !ruby/object:Gem::Version
163
+ version: 0.9.12.6
164
+ type: :development
165
+ prerelease: false
166
+ version_requirements: !ruby/object:Gem::Requirement
167
+ requirements:
168
+ - - "~>"
169
+ - !ruby/object:Gem::Version
170
+ version: '0.9'
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: 0.9.12.6
174
+ - !ruby/object:Gem::Dependency
175
+ name: awesome_print
176
+ requirement: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '1.2'
181
+ - - ">="
182
+ - !ruby/object:Gem::Version
183
+ version: 1.2.0
184
+ type: :development
185
+ prerelease: false
186
+ version_requirements: !ruby/object:Gem::Requirement
187
+ requirements:
188
+ - - "~>"
189
+ - !ruby/object:Gem::Version
190
+ version: '1.2'
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: 1.2.0
194
+ description: Write a longer description. Optional.
195
+ email:
196
+ - xfstart07@gmail.com
197
+ - howl.wong@gmail.com
198
+ executables: []
199
+ extensions: []
200
+ extra_rdoc_files: []
201
+ files:
202
+ - Gemfile
203
+ - README.md
204
+ - Rakefile
205
+ - lib/mmonitor.rb
206
+ - lib/mmonitor/crawler.rb
207
+ - lib/mmonitor/parser.rb
208
+ - lib/mmonitor/spider.rb
209
+ - lib/mmonitor/strategies/amazon.rb
210
+ - lib/mmonitor/strategies/jd.rb
211
+ - lib/mmonitor/strategies/jumei.rb
212
+ - lib/mmonitor/strategies/lefeng.rb
213
+ - lib/mmonitor/strategies/suning.rb
214
+ - lib/mmonitor/strategies/tmall.rb
215
+ - lib/mmonitor/strategies/yhd.rb
216
+ - lib/mmonitor/strategies/yixun.rb
217
+ - mmonitor.gemspec
218
+ homepage: ''
219
+ licenses:
220
+ - MIT
221
+ metadata: {}
222
+ post_install_message:
223
+ rdoc_options: []
224
+ require_paths:
225
+ - lib
226
+ required_ruby_version: !ruby/object:Gem::Requirement
227
+ requirements:
228
+ - - ">="
229
+ - !ruby/object:Gem::Version
230
+ version: '0'
231
+ required_rubygems_version: !ruby/object:Gem::Requirement
232
+ requirements:
233
+ - - ">="
234
+ - !ruby/object:Gem::Version
235
+ version: '0'
236
+ requirements: []
237
+ rubyforge_project:
238
+ rubygems_version: 2.3.0
239
+ signing_key:
240
+ specification_version: 4
241
+ summary: Write a short summary. Required.
242
+ test_files: []