mmonitor 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: aa406deb1d049e0bb84bdb8c4cd55c534af658fa
4
+ data.tar.gz: 058830a5dbe7e114996a3a4c46148769e318d409
5
+ SHA512:
6
+ metadata.gz: 1da90674d7fcf473f2aac1a0f29882b1315ae80e2d6642cdf1954ef4263d8bed22b22ad3d499ae88176c9b51d694ff1d966da969f9438d4f815241b01e9a0824
7
+ data.tar.gz: bd655bbb204e52493ccd90491f006eda03803104268fd6dc945f6080d5e2f98d7f7c37e96cb17e76a7203f855c4d471ea1a7d3dfb3b6a257e9082a5de9ca2266
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in mmonitor.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # MMonitor
2
+
3
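+ MMonitor crawls product search-result pages on Chinese e-commerce sites (Amazon.cn, JD, Jumei, Lefeng, Suning, Tmall, Yhd and Yixun) and extracts each product's ID, title, list price, sale price and image URLs.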
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'mmonitor'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install mmonitor
18
+
19
+ ## Usage
20
+
21
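+ Create one `MMonitor::Crawler` per search-result URL. Each crawler walks the result pages and exposes the parsed products through `items`, a hash keyed by product ID. The example below merges the results from several sites and writes them to a CSV file: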
22
+
23
+ ``` ruby
24
+ # -*- encoding: utf-8 -*-
25
+ require 'mmonitor'
26
+ items = {}
27
+ url = 'http://loreal.tmall.com/category.htm?search=y&scene=taobao_shop'
28
+ a = MMonitor::Crawler.new(url)
29
+ items.merge!(a.items)
30
+
31
+ url = 'http://www.amazon.cn/s/ref=sr_nr_p_89_0?rh=i%3Aaps%2Ck%3A%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85%2Cp_89%3AL%27Oreal+Paris+%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&keywords=%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&ie=UTF8&qid=1397212889&rnid=125596071'
32
+ b = MMonitor::Crawler.new(url)
33
+ items.merge!(b.items)
34
+
35
+ url = 'http://search.jd.com/search?keyword=%E7%BE%8E%E5%AE%9D%E8%8E%B2&enc=utf-8&qr=&qrst=UNEXPAND&et=&rt=1&stop=1&area=1&wtype=1&ev=exbrand_%E7%BE%8E%E5%AE%9D%E8%8E%B2%EF%BC%88MAYBELLINE%EF%BC%89%40&uc=0#select'
36
+ c = MMonitor::Crawler.new(url)
37
+ items.merge!(c.items)
38
+
39
+ url = 'http://search.jumei.com/?filter=0-11-1&search=%E6%AC%A7%E8%8E%B1%E9%9B%85&from=search_topbar_%E6%AC%A7%E8%8E%B1%E9%9B%85_word_pos1&cat=&bid=1'
40
+ d = MMonitor::Crawler.new(url)
41
+ items.merge!(d.items)
42
+
43
+ url = 'http://search.lefeng.com/search/search?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&wt.s_pg=Isearch&wt.s_pf=public'
44
+ e = MMonitor::Crawler.new(url)
45
+ items.merge!(e.items)
46
+
47
+ url = 'http://search.suning.com/%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85/cityId=9264&iy=-1&ct=1&si=5&st=0'
48
+ f = MMonitor::Crawler.new(url)
49
+ items.merge!(f.items)
50
+
51
+ url = 'http://www.yhd.com/ctg/s2/c0-0/b/a-s1-v0-p1-price-d0-f06-m1-rt0-pid-mid0-k%E6%AC%A7%E8%8E%B1%E9%9B%85/'
52
+ j = MMonitor::Crawler.new(url)
53
+ items.merge!(j.items)
54
+
55
+ url = 'http://searchex.yixun.com/html?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&area=1&sort=0&show=0&size=40&pf=1&as=1&charset=utf-8&YTAG=1.100000401#list'
56
+ h = MMonitor::Crawler.new(url)
57
+ items.merge!(h.items)
58
+
59
+ require 'csv'
60
+ header_row = ['平台ID', 'SKU ID','商品名称', '平台', '原价', '销售价', '商品图片', '价格图片', '抓取日期']
61
+ CSV.open('测试数据.csv', "wb:GB18030", col_sep: ',') do |csv|
62
+ csv << header_row
63
+ items.each do |item_id, item|
64
+ csv << [
65
+ item_id,
66
+ item[:sku_id],
67
+ item[:title],
68
+ item[:provider],
69
+ item[:tag_price],
70
+ item[:price],
71
+ item[:photo_url],
72
+ item[:price_url],
73
+ Time.now.to_date.to_s
74
+ ]
75
+ end
76
+ end
77
+ ```
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/lib/mmonitor.rb ADDED
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'active_support/core_ext'
3
+
4
+ module MMonitor
5
+ VERSION = '0.0.2'
6
+
7
+ module Strategies # 目前支持的解析规则
8
+ autoload :Amazon, 'mmonitor/strategies/amazon'
9
+ autoload :Jd, 'mmonitor/strategies/jd'
10
+ autoload :Jumei, 'mmonitor/strategies/jumei'
11
+ autoload :Lefeng, 'mmonitor/strategies/lefeng'
12
+ autoload :Suning, 'mmonitor/strategies/suning'
13
+ autoload :Tmall, 'mmonitor/strategies/tmall'
14
+ autoload :Yhd, 'mmonitor/strategies/yhd'
15
+ autoload :Yixun, 'mmonitor/strategies/yixun'
16
+ end
17
+
18
+ autoload :Crawler, 'mmonitor/crawler'
19
+ autoload :Parser, 'mmonitor/parser'
20
+ autoload :Spider, 'mmonitor/spider'
21
+ end
@@ -0,0 +1,83 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'addressable/uri'
3
+
4
+ module MMonitor
5
+ class Crawler # 爬虫,负责数据抓取流程
6
+
7
+ attr_accessor :adapter, :url, :params, :pages, :page, :page_key, :total
8
+
9
+ def initialize(url)
10
+ self.page = 1
11
+ self.page_key = 'page'
12
+ process(url)
13
+ end
14
+
15
+ def items
16
+ self.adapter.items
17
+ end
18
+
19
+ private
20
+
21
+ # URL格式化
22
+ def process(uri)
23
+ uri = Addressable::URI.parse(uri)
24
+ host = uri.host
25
+ self.url = "#{uri.scheme}://#{host}#{uri.path}"
26
+ self.params = uri.query_values || {}
27
+
28
+ self.adapter = case
29
+ when host.include?('amazon.cn')
30
+ Strategies::Amazon.new(html)
31
+ when host.include?('jd.com')
32
+ Strategies::Jd.new(html)
33
+ when host.include?('jumei.com')
34
+ Strategies::Jumei.new(html)
35
+ when host.include?('lefeng.com')
36
+ Strategies::Lefeng.new(html)
37
+ when host.include?('suning.com')
38
+ Strategies::Suning.new(html)
39
+ when host.include?('tmall.com')
40
+ Strategies::Tmall.new(html)
41
+ when host.include?('yhd.com')
42
+ Strategies::Yhd.new(html, self.url)
43
+ when host.include?('yixun.com')
44
+ Strategies::Yixun.new(html)
45
+ else
46
+ puts host
47
+ puts '_'*88
48
+ nil
49
+ end
50
+ # 当前页数
51
+ self.pages = self.adapter.pages
52
+ self.total = self.adapter.total
53
+ self.page_key = self.adapter.page_key
54
+ next_page
55
+ end
56
+
57
+ def next_page
58
+ puts "分页提示:#{self.page}/#{self.pages}"
59
+ puts '_'*88
60
+ if self.pages > self.page
61
+ self.page += 1
62
+ self.params[self.page_key] = self.adapter.page(self.page)
63
+ self.adapter.body = html
64
+ self.adapter.process
65
+ else
66
+ self.adapter.extra
67
+ self.adapter.body = nil
68
+ self.adapter.item = nil
69
+ puts "产品差异:#{self.items.count}/#{self.total}" unless self.total.nil?
70
+ return nil
71
+ end
72
+ next_page
73
+ end
74
+
75
+ def html
76
+ if self.adapter.nil?
77
+ Spider.get_html(self.url, self.params)
78
+ else
79
+ self.adapter.get_html(self.url, self.params)
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,133 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+
5
+ class Parser # 解析器,负责网页模板解析。
6
+
7
+ attr_accessor :body, :items, :item
8
+
9
+ def initialize(dom)
10
+ return nil if dom.nil?
11
+ self.body = dom.at(css_path[:body])
12
+ self.items ||= {}
13
+ process unless self.body.nil?
14
+ end
15
+
16
+ def process
17
+ self.list.each do |item|
18
+ self.item = item
19
+ id = spu_id
20
+ unless id.nil?
21
+ self.items[id] = {
22
+ provider: self.provider,
23
+ title: title,
24
+ sku_id: sku_id,
25
+ photo_url: photo_url,
26
+ tag_price: tag_price,
27
+ price: price,
28
+ price_url: price_url
29
+ }
30
+ end
31
+ end
32
+ end
33
+
34
+ def list # 产品列表
35
+ products = []
36
+ if css_path[:list].is_a?(Array)
37
+ css_path[:list].each do |li|
38
+ dom = self.body.at(li)
39
+ products += dom.css(css_path[:item]) unless dom.nil?
40
+ end
41
+ else
42
+ products += self.body.at(css_path[:list]).css(css_path[:item])
43
+ end
44
+ products
45
+ end
46
+
47
+ def extra # 拓展操作
48
+
49
+ end
50
+
51
+ def pages
52
+ self.body.at(css_path[:pages]).text.to_i
53
+ end
54
+
55
+ def total
56
+ if css_path.has_key?(:total)
57
+ squish( self.body.at(css_path[:total]).try(:text) )
58
+ else
59
+ nil
60
+ end
61
+ end
62
+
63
+ def page_key
64
+ 'page'
65
+ end
66
+
67
+ def page(num)
68
+ num
69
+ end
70
+
71
+ def get_html(url, params)
72
+ Spider.get_html(url, params)
73
+ end
74
+
75
+ def spu_id # 产品ID
76
+ squish( self.item[css_path[:spu_id]] )
77
+ end
78
+
79
+ def sku_id # 单品ID
80
+ if css_path.has_key?(:sku_id)
81
+ squish( self.item[css_path[:sku_id]] )
82
+ else
83
+ nil
84
+ end
85
+ end
86
+
87
+ def title # 产品标题
88
+ squish( self.item.at(css_path[:title]).try(:text) )
89
+ end
90
+
91
+ def photo_url # 产品图片
92
+ self.item.at(css_path[:photo_url])['src']
93
+ end
94
+
95
+ def price # 产品售价
96
+ if css_path.has_key?(:price)
97
+ squish( self.item.at(css_path[:price]).try(:text) ).to_f
98
+ else
99
+ nil
100
+ end
101
+ end
102
+
103
+ def tag_price # 产品原价(可选)
104
+ if css_path.has_key?(:tag_price)
105
+ squish( self.item.at(css_path[:tag_price]).try(:text) ).to_f
106
+ else
107
+ nil
108
+ end
109
+ end
110
+
111
+ def price_url # 产品售价图片(可选)
112
+ if css_path.has_key?(:price_url)
113
+ self.item.at(css_path[:price_url])['src']
114
+ else
115
+ nil
116
+ end
117
+ end
118
+
119
+ private
120
+
121
+ # 字符处理
122
+ def squish(string)
123
+ string.gsub(/¥|¥/, '').strip.gsub(/\s+/, '') rescue nil
124
+ end
125
+
126
+ # 无货产品,通过图片获取价格
127
+ def pirce_from_ocr(price_url)
128
+ str = MMonitor::Spider.get_ocr(price_url)
129
+ str.match( /\d+\.\d+/ ).to_f
130
+ end
131
+ end
132
+
133
+ end
@@ -0,0 +1,80 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'oj'
3
+ require 'faraday'
4
+ require 'nokogiri'
5
+ require 'rtesseract'
6
+ require 'mini_magick'
7
+
8
+ module MMonitor
9
+ module Spider # 蜘蛛,负责http请求处理
10
+ class << self
11
+ # 抓取HTML
12
+ def get_html(url, params={})
13
+ body = get(url, params)
14
+ $body = body
15
+ ::Nokogiri::HTML(body)
16
+ end
17
+ # 抓取JSON
18
+ def get_json(url, params={})
19
+ body = get(url, params)
20
+ ::Oj.load(body) rescue {}
21
+ end
22
+ # 抓取图片上的文字
23
+ def get_ocr(photo_url)
24
+ image = MiniMagick::Image.open(photo_url)
25
+ image.combine_options do |c|
26
+ c.background '#FFFFFF'
27
+ c.colorspace 'GRAY'
28
+ c.alpha 'remove'
29
+ end
30
+ image.format 'jpg'
31
+ ocr = RTesseract.new(image.path, processor: 'mini_magick')
32
+ str = ocr.to_s
33
+ image.destroy!
34
+ return str
35
+ end
36
+ # 分页
37
+ def number_page(total, limit)
38
+ count = total / limit
39
+ count += 1 if total % limit > 0
40
+ count
41
+ end
42
+
43
+ private
44
+
45
+ def get(url, params={})
46
+ resp = conn.get url, params
47
+ # 根据状态返回/跳转/终止
48
+ case resp.status
49
+ when 200 # 正常
50
+ resp.body
51
+ when 302 # 跳转
52
+ get( resp.headers['location'] )
53
+ else
54
+ nil
55
+ end
56
+ end
57
+
58
+ # 连接
59
+ def conn
60
+ @conn ||= Faraday.new(ssl: false)
61
+ @conn.headers[:user_agent] = switcher
62
+ @conn.headers[:accept] = 'text/html,application/json;q=0.9'
63
+ @conn
64
+ end
65
+ # 混淆
66
+ def switcher
67
+ [
68
+ 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; 360se)',
69
+ 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)',
70
+ 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
71
+ 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
72
+ 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
73
+ 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; SE 2.X MetaSr 1.0)',
74
+ 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.802.30 Safari/535.1 SE 2.X MetaSr 1.0',
75
+ 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.3 (KHTML, like Gecko) Maxthon/3.3.2.1000 Chrome/16.0.883.0 Safari/535.3',
76
+ ].sample
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://www.amazon.cn/s/ref=sr_nr_p_89_0?rh=i%3Aaps%2Ck%3A%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85%2Cp_89%3AL%27Oreal+Paris+%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&keywords=%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85&ie=UTF8&qid=1397212889&rnid=125596071
6
+ class Amazon < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :amazon
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: 'div#rightResultsATF',
15
+ list: ['div#atfResults', 'div#btfResults'],
16
+ item: 'div.prod.celwidget',
17
+ total: '#resultCount > span',
18
+ pages: '#pagn > span.pagnDisabled',
19
+ spu_id: 'name',
20
+ title: 'h3.newaps>a',
21
+ photo_url: 'div.image.imageContainer > a > div > img',
22
+ tag_price: 'ul.rsltGridList > li.newp > div > a > del',
23
+ price: 'ul.rsltGridList > li.newp > div > a > span'
24
+ }
25
+ end
26
+
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,72 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://search.jd.com/search?keyword=%B0%CD%C0%E8%C5%B7%C0%B3%D1%C5&qr=&qrst=UNEXPAND&et=&rt=1&stop=1&ev=exbrand_%C5%B7%C0%B3%D1%C5%A3%A8LOREAL%A3%A9%40&area=1&wtype=1
6
+ class Jd < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :jd
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: 'body>div.w.main>div.right-extra',
15
+ list: 'div#plist>ul.list-h',
16
+ item: 'li',
17
+ total: 'div#filter>div.fore1>div.total>span>strong',
18
+ pages: 'div#filter>div.fore1>div#top_pagi>span',
19
+ spu_id: 'sku',
20
+ title: 'div>div.p-name>a',
21
+ photo_url: 'div>div.p-img >a>img'
22
+ }
23
+ end
24
+
25
+ def spu_id # 产品ID
26
+ id = squish( self.item[css_path[:spu_id]] )
27
+ id == 'scroll_loading' ? nil : id
28
+ end
29
+
30
+ def photo_url # 产品图片
31
+ self.item.at(css_path[:photo_url])['data-lazyload']
32
+ end
33
+
34
+ def pages
35
+ self.body.at(css_path[:pages]).text.split('/')[1].to_i
36
+ end
37
+
38
+ def extra
39
+ get_prices(self.items.keys)
40
+ end
41
+
42
+ private
43
+ # 设置价格
44
+ def set_prices(prices)
45
+ prices.each do |item|
46
+ id = item['id'].gsub('J_', '')
47
+ if self.items.has_key?(id) && item['m'].to_i > 0
48
+ self.items[id][:tag_price] = item['m'].to_f
49
+ self.items[id][:price] = item['p'].to_f
50
+ else
51
+ puts item['id']
52
+ puts '_'*88
53
+ end
54
+ end
55
+ end
56
+ # 获取价格
57
+ def get_prices(sku_ids)
58
+ url = "http://p.3.cn/prices/mgets"
59
+ sku_ids = sku_ids.map { |sku_id| "J_#{sku_id}" }
60
+ sku_ids.each_slice(100) do |skuids|
61
+ params = {
62
+ skuids: skuids.join(',')
63
+ }
64
+ prices = MMonitor::Spider.get_json(url, params)
65
+ unless prices.empty?
66
+ set_prices(prices)
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,45 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://search.jumei.com/?filter=0-11-1&search=%E6%AC%A7%E8%8E%B1%E9%9B%85&from=search_topbar_%E6%AC%A7%E8%8E%B1%E9%9B%85_word_pos1&cat=&bid=1
6
+ class Jumei < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :jumei
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: 'body>div#container>div#body>div#search_result_wrap>div#search_list_wrap',
15
+ list: 'div.products_wrap>ul',
16
+ item: 'li',
17
+ total: 'div.search_list_head_fiex>div>div.head_pagecount>span',
18
+ pages: 'div.search_list_head_fiex>div>div.head_pageInfo',
19
+ spu_id: 'pid',
20
+ title: 'div>div.s_l_name>a',
21
+ price: 'div > div.search_list_price > span',
22
+ tag_price: 'div > div.search_list_price > del',
23
+ photo_url: 'div>div.s_l_pic>a>img'
24
+ }
25
+ end
26
+
27
+ def page_key
28
+ 'filter'
29
+ end
30
+
31
+ def pages
32
+ self.body.at(css_path[:pages]).text.split('/')[1].to_i
33
+ end
34
+
35
+ def page(num)
36
+ "0-11-#{num}"
37
+ end
38
+
39
+ def photo_url # 产品图片
40
+ self.item.at(css_path[:photo_url])['original']
41
+ end
42
+
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,82 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://search.lefeng.com/search/search?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&wt.s_pg=Isearch&wt.s_pf=public
6
+ class Lefeng < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :lefeng
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: 'body>div.mainBg>div.main>div.cont>div#clothList',
15
+ list: 'div.smPruArea>div.makeup',
16
+ item: 'div.pruwrap',
17
+ total: 'div#sm-nav > span.tpages > span.tpageNum > i',
18
+ pages: 'div#sm-nav > span.tpages > span.tpageNum > em',
19
+ spu_id: 'id',
20
+ sku_id: 'skuids',
21
+ title: 'dl>dd.nam>a',
22
+ photo_url: 'dl>dt>a>img',
23
+ tag_price: 'dl>dd.pri>del',
24
+ price_url: 'dl>dd.pri>img',
25
+ }
26
+ end
27
+
28
+ def pages
29
+ self.body.at(css_path[:pages]).text.split('/')[1].to_i
30
+ end
31
+
32
+ def page_key
33
+ 'pageNo'
34
+ end
35
+
36
+ def price
37
+ price_from_cart(spu_id, sku_id) || pirce_from_ocr(price_url)
38
+ end
39
+
40
+ def sku_id
41
+ self.item.at('dl')[css_path[:sku_id]]
42
+ end
43
+
44
+ def photo_url # 产品图片
45
+ self.item.at(css_path[:photo_url])['pagespeed_lazy_src']
46
+ end
47
+
48
+ def price_url # 产品售价图片(可选)
49
+ self.item.at(css_path[:price_url])['pagespeed_lazy_src'].gsub('_73_', '_75_')
50
+ end
51
+
52
+ private
53
+ # 通过购物车,获取价格
54
+ def price_from_cart(spu_id, sku_id)
55
+ url = "http://shopping.lefeng.com/cart/ajaxUpsellcart.jsp"
56
+ params = {
57
+ 'productId' => spu_id,
58
+ 'skuId' => sku_id,
59
+ 'quantity' => 1,
60
+ 'itemType' => 0,
61
+ 'process' => 'add',
62
+ 'skuExtraPriceId' => 0,
63
+ 'fromSite' => 'webAllProvince',
64
+ '_' => 1398446675733
65
+ }
66
+ json = MMonitor::Spider.get_json(url, params)
67
+ if json["errorInfo"].blank?
68
+ return json['totalPrice'] if json['totalPrice'].is_a?(Float)
69
+ end
70
+ puts '将通过图片识别价格,很耗资源呢~'
71
+ puts '_'*88
72
+ nil
73
+ end
74
+ # 无货产品,通过图片获取价格
75
+ def pirce_from_ocr(price_url)
76
+ price_url.gsub!('_73_', '_75_')
77
+ str = MMonitor::Spider.get_ocr(price_url)
78
+ str.match( /\d+\.\d+/ )[0].to_f
79
+ end
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,69 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://search.suning.com/%E5%B7%B4%E9%BB%8E%E6%AC%A7%E8%8E%B1%E9%9B%85/cityId=9264&iy=-1&ct=1&si=5&st=0
6
+ class Suning < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :suning
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: 'body>div.wrap-search>div.w1190>div.proList.mb10',
15
+ list: 'div#productTab>div#proShow>ul.container',
16
+ item: 'li.item',
17
+ total: 'div#filterContent>div.searchKeyT>i',
18
+ pages: '#refresh>div.page>span>i#pageTotal',
19
+ spu_id: 'name',
20
+ sku_id: 'class',
21
+ title: 'div > span > a > p',
22
+ photo_url: 'a > img',
23
+ price_url: 'div > p > img'
24
+ }
25
+ end
26
+
27
+ def spu_id
28
+ self.item[css_path[:spu_id]].to_i
29
+ end
30
+
31
+ def sku_id
32
+ self.item[css_path[:sku_id]].split(' ')[0]
33
+ end
34
+
35
+ def price_url # 产品售价图片(可选)
36
+ dom = self.item.at(css_path[:price_url])
37
+ dom['src2'] unless dom.nil?
38
+ end
39
+
40
+ def price
41
+ price_from_ajax(sku_id) || pirce_from_ocr(price_url)
42
+ end
43
+
44
+ def page_key
45
+ 'cp'
46
+ end
47
+
48
+ def page(num)
49
+ num-1
50
+ end
51
+
52
+ def get_html(url, params)
53
+ uri = Addressable::URI.new
54
+ uri.query_values = params
55
+ Spider.get_html("#{url}&#{uri.query}")
56
+ end
57
+
58
+ private
59
+ # 浏览记录,获取价格
60
+ def price_from_ajax(sku_id)
61
+ url = "http://product.suning.com/0000000000/browseHistory_10052_10051_#{sku_id}_9264_.html"
62
+ json = MMonitor::Spider.get_json(url)
63
+ return json['price'].to_f if json.has_key?('price')
64
+ nil
65
+ end
66
+
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,39 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://loreal.tmall.com/category.htm?search=y&scene=taobao_shop
6
+ class Tmall < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :tmall
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: '#TmshopSrchNav>div',
15
+ list: 'div.J_TItems',
16
+ item: 'dl.item',
17
+ pages: 'div.filter.clearfix.J_TFilter > p > b.ui-page-s-len', # .text.split('/')[1]
18
+ spu_id: 'data-id',
19
+ title: 'dd.detail > a',
20
+ photo_url: 'dt > a:nth-child(1) > img', # 'data-ks-lazyload'
21
+ price: 'dd.detail > div > div.cprice-area > span.c-price'
22
+ }
23
+ end
24
+
25
+ def photo_url # 产品图片
26
+ self.item.at(css_path[:photo_url])['data-ks-lazyload']
27
+ end
28
+
29
+ def pages
30
+ self.body.at(css_path[:pages]).text.split('/')[1].to_i
31
+ end
32
+
33
+ def page_key
34
+ 'pageNo'
35
+ end
36
+
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,94 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://www.yhd.com/ctg/s2/c0-0/b/a-s1-v0-p1-price-d0-f06-m1-rt0-pid-mid0-k%E6%AC%A7%E8%8E%B1%E9%9B%85/
6
+ class Yhd < MMonitor::Parser
7
+
8
+ attr_accessor :body, :items, :item, :url, :pages_count, :page
9
+
10
+ def initialize(dom, url)
11
+ self.body = dom.at(css_path[:body])
12
+ self.items ||= {}
13
+ self.url = url.sub('s2', 'searchPage')
14
+ self.pages_count = self.body.at(css_path[:pages]).text.split('/')[1].to_i
15
+ self.page = 1
16
+ process
17
+ get_ajax(true)
18
+ next_page
19
+ end
20
+
21
+ # 输出的产品
22
+ def provider
23
+ :yhd
24
+ end
25
+
26
+ def css_path
27
+ {
28
+ body: '#plist',
29
+ list: '#itemSearchList',
30
+ item: 'li.search_item > div.search_item_box',
31
+ pages: '#rankOpDiv > div > div.select_page_num', # .text.split('/')[1]
32
+ spu_id: 'pmid',
33
+ title: 'p.title > a',
34
+ photo_url: 'a.search_prod_img > img', # 'original' || 'src'
35
+ price: 'div.pricebox.clearfix > span:nth-child(1)'
36
+ }
37
+ end
38
+
39
+ def spu_id # 产品ID
40
+ self.item.at(css_path[:title])[css_path[:spu_id]]
41
+ end
42
+
43
+ def photo_url # 产品图片
44
+ img = self.item.at(css_path[:photo_url])
45
+ img['original'] || img['src']
46
+ end
47
+
48
+ # 分页伪装
49
+ def pages
50
+ 1
51
+ end
52
+
53
+ private
54
+
55
+ def next_page
56
+ puts "分页提示:#{self.page}/#{self.pages_count}"
57
+ puts '_'*88
58
+ if self.pages_count > self.page
59
+ self.page += 1
60
+ self.url.gsub!("-p#{self.page-1}-", "-p#{self.page}-")
61
+ get_ajax # 执行
62
+ get_ajax(true)
63
+ else
64
+ self.body = nil
65
+ self.item = nil
66
+ return nil
67
+ end
68
+ next_page
69
+ end
70
+
71
+ def more_params
72
+ {
73
+ 'isGetMoreProducts' => 1,
74
+ 'moreProductsDefaultTemplate' => 0
75
+ }
76
+ end
77
+
78
+ # 加载更多商品
79
+ def get_ajax(more=false)
80
+ params = more ? more_params : {}
81
+ json = MMonitor::Spider.get_json(self.url, params)
82
+ if json.has_key?('value')
83
+ html = json['value']
84
+ unless html.nil?
85
+ self.body = ::Nokogiri::HTML("<div id='itemSearchList'>#{html}</div>")
86
+ # 执行
87
+ self.process
88
+ end
89
+ end
90
+ end
91
+
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,36 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module MMonitor
4
+ module Strategies
5
+ # http://searchex.yixun.com/html?key=%E6%AC%A7%E8%8E%B1%E9%9B%85&area=1&sort=0&show=0&size=40&pf=1&as=1&charset=utf-8&YTAG=1.100000401#list
6
+ class Yixun < MMonitor::Parser
7
+ # 输出的产品
8
+ def provider
9
+ :yixun
10
+ end
11
+
12
+ def css_path
13
+ {
14
+ body: 'div#container > div > div.grid_m.smain > div.grid_m_inner',
15
+ list: 'div.goods > ul#itemList',
16
+ item: 'li.goods_li',
17
+ total: '#list > div.sort_page > div.sort_page_txt > b',
18
+ pages: '#list > div.sort_page > div.sort_page_num > span', # .text.split('/')[1]
19
+ spu_id: 'commid',
20
+ title: 'div > div.mod_goods_info > p.mod_goods_tit > a',
21
+ photo_url: 'div > div.mod_goods_img > a > img', # 'init_src'
22
+ price: 'div > div.mod_goods_info > p.mod_goods_price > span.mod_price > span'
23
+ }
24
+ end
25
+
26
+ def photo_url # 产品图片
27
+ self.item.at(css_path[:photo_url])['init_src']
28
+ end
29
+
30
+ def pages
31
+ self.body.at(css_path[:pages]).text.split('/')[1].to_i
32
+ end
33
+
34
+ end
35
+ end
36
+ end
data/mmonitor.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = "mmonitor"
3
+ lib_file = File.expand_path("../lib/#{lib}.rb", __FILE__)
4
+ File.read(lib_file) =~ /\bVERSION\s*=\s*["'](.+?)["']/
5
+ version = $1
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = lib
9
+ spec.version = version
10
+ spec.authors = ["Xu Fei", "Howl王"]
11
+ spec.email = ["xfstart07@gmail.com", "howl.wong@gmail.com"]
12
+ spec.summary = %q{Write a short summary. Required.}
13
+ spec.description = %q{Write a longer description. Optional.}
14
+ spec.homepage = ""
15
+ spec.license = "MIT"
16
+
17
+ spec.files = %w(Gemfile README.md Rakefile)
18
+ spec.files << "#{lib}.gemspec"
19
+ spec.files += Dir.glob("lib/**/*.rb")
20
+
21
+ spec.add_runtime_dependency 'oj', '~> 2.8', '>= 2.8.1'
22
+ spec.add_runtime_dependency 'faraday', '~> 0.9', '>= 0.9.0'
23
+ spec.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.1'
24
+ spec.add_runtime_dependency 'rtesseract', '~> 1.2', '>= 1.2.1'
25
+ spec.add_runtime_dependency 'mini_magick', '~> 3.7', '>= 3.7.0'
26
+ spec.add_runtime_dependency 'addressable', '~> 2.3', '>= 2.3.6'
27
+ spec.add_runtime_dependency 'activesupport', '~> 4.1', '>= 4.1.0'
28
+
29
+ spec.add_development_dependency 'pry', '~> 0.9', '>= 0.9.12.6'
30
+ spec.add_development_dependency 'awesome_print', '~> 1.2', '>= 1.2.0'
31
+ end
metadata ADDED
@@ -0,0 +1,242 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mmonitor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Xu Fei
8
+ - Howl王
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-08-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: oj
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '2.8'
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.8.1
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: !ruby/object:Gem::Requirement
27
+ requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '2.8'
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 2.8.1
34
+ - !ruby/object:Gem::Dependency
35
+ name: faraday
36
+ requirement: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.9'
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.9.0
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - "~>"
49
+ - !ruby/object:Gem::Version
50
+ version: '0.9'
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.9.0
54
+ - !ruby/object:Gem::Dependency
55
+ name: nokogiri
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.6'
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: 1.6.1
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - "~>"
69
+ - !ruby/object:Gem::Version
70
+ version: '1.6'
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: 1.6.1
74
+ - !ruby/object:Gem::Dependency
75
+ name: rtesseract
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - "~>"
79
+ - !ruby/object:Gem::Version
80
+ version: '1.2'
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: 1.2.1
84
+ type: :runtime
85
+ prerelease: false
86
+ version_requirements: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '1.2'
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: 1.2.1
94
+ - !ruby/object:Gem::Dependency
95
+ name: mini_magick
96
+ requirement: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - "~>"
99
+ - !ruby/object:Gem::Version
100
+ version: '3.7'
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: 3.7.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.7'
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: 3.7.0
114
+ - !ruby/object:Gem::Dependency
115
+ name: addressable
116
+ requirement: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - "~>"
119
+ - !ruby/object:Gem::Version
120
+ version: '2.3'
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: 2.3.6
124
+ type: :runtime
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '2.3'
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: 2.3.6
134
+ - !ruby/object:Gem::Dependency
135
+ name: activesupport
136
+ requirement: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - "~>"
139
+ - !ruby/object:Gem::Version
140
+ version: '4.1'
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: 4.1.0
144
+ type: :runtime
145
+ prerelease: false
146
+ version_requirements: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - "~>"
149
+ - !ruby/object:Gem::Version
150
+ version: '4.1'
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ version: 4.1.0
154
+ - !ruby/object:Gem::Dependency
155
+ name: pry
156
+ requirement: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - "~>"
159
+ - !ruby/object:Gem::Version
160
+ version: '0.9'
161
+ - - ">="
162
+ - !ruby/object:Gem::Version
163
+ version: 0.9.12.6
164
+ type: :development
165
+ prerelease: false
166
+ version_requirements: !ruby/object:Gem::Requirement
167
+ requirements:
168
+ - - "~>"
169
+ - !ruby/object:Gem::Version
170
+ version: '0.9'
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: 0.9.12.6
174
+ - !ruby/object:Gem::Dependency
175
+ name: awesome_print
176
+ requirement: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '1.2'
181
+ - - ">="
182
+ - !ruby/object:Gem::Version
183
+ version: 1.2.0
184
+ type: :development
185
+ prerelease: false
186
+ version_requirements: !ruby/object:Gem::Requirement
187
+ requirements:
188
+ - - "~>"
189
+ - !ruby/object:Gem::Version
190
+ version: '1.2'
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: 1.2.0
194
+ description: Write a longer description. Optional.
195
+ email:
196
+ - xfstart07@gmail.com
197
+ - howl.wong@gmail.com
198
+ executables: []
199
+ extensions: []
200
+ extra_rdoc_files: []
201
+ files:
202
+ - Gemfile
203
+ - README.md
204
+ - Rakefile
205
+ - lib/mmonitor.rb
206
+ - lib/mmonitor/crawler.rb
207
+ - lib/mmonitor/parser.rb
208
+ - lib/mmonitor/spider.rb
209
+ - lib/mmonitor/strategies/amazon.rb
210
+ - lib/mmonitor/strategies/jd.rb
211
+ - lib/mmonitor/strategies/jumei.rb
212
+ - lib/mmonitor/strategies/lefeng.rb
213
+ - lib/mmonitor/strategies/suning.rb
214
+ - lib/mmonitor/strategies/tmall.rb
215
+ - lib/mmonitor/strategies/yhd.rb
216
+ - lib/mmonitor/strategies/yixun.rb
217
+ - mmonitor.gemspec
218
+ homepage: ''
219
+ licenses:
220
+ - MIT
221
+ metadata: {}
222
+ post_install_message:
223
+ rdoc_options: []
224
+ require_paths:
225
+ - lib
226
+ required_ruby_version: !ruby/object:Gem::Requirement
227
+ requirements:
228
+ - - ">="
229
+ - !ruby/object:Gem::Version
230
+ version: '0'
231
+ required_rubygems_version: !ruby/object:Gem::Requirement
232
+ requirements:
233
+ - - ">="
234
+ - !ruby/object:Gem::Version
235
+ version: '0'
236
+ requirements: []
237
+ rubyforge_project:
238
+ rubygems_version: 2.3.0
239
+ signing_key:
240
+ specification_version: 4
241
+ summary: Write a short summary. Required.
242
+ test_files: []