crowd_funding_parser 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.travis.yml +17 -0
- data/Gemfile +2 -0
- data/Guardfile +11 -0
- data/README.md +2 -0
- data/Rakefile +8 -0
- data/crowd_funding_parser.gemspec +12 -0
- data/lib/crowd_funding_parser.rb +4 -0
- data/lib/crowd_funding_parser/general.rb +63 -53
- data/lib/crowd_funding_parser/method_builder.rb +47 -0
- data/lib/crowd_funding_parser/parser/an9.rb +196 -0
- data/lib/crowd_funding_parser/parser/flyingv.rb +64 -64
- data/lib/crowd_funding_parser/parser/hereo.rb +99 -0
- data/lib/crowd_funding_parser/parser/kickstarter.rb +158 -0
- data/lib/crowd_funding_parser/parser/taobao.rb +150 -0
- data/lib/crowd_funding_parser/parser/webackers.rb +73 -66
- data/lib/crowd_funding_parser/parser/zeczec.rb +79 -60
- data/lib/crowd_funding_parser/version.rb +1 -1
- data/spec/fixtures/vcr_cassettes/flyingv.yml +4869 -0
- data/spec/fixtures/vcr_cassettes/zeczec.yml +791 -0
- data/spec/parsers/flyingv_spec.rb +62 -0
- data/spec/parsers/zezec_spec.rb +62 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/support/vcr_sites.rb +8 -0
- metadata +191 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a49010d36a36cab50f0c001bb1d368934c1126a3
|
4
|
+
data.tar.gz: 177a84fa3773c046fd6e4488c3470aae83c25cef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e3129df45328fb30f828ce77de424813926dae5d169da8ca66e0e3c55dcf5f5590e02bebf2810be137c78d2e2fbe59b251250b453e39c064678a9422045392a
|
7
|
+
data.tar.gz: 028b83e89377e5c631893be152b902e4f6430c187d43d5fedc5e4e18b67d0dfdfc80417c0d66e39d25a9026f51ef5432bc0cdd3d188fe6e376b680e3d4fb94aa
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
|
4
|
+
rvm:
|
5
|
+
- 2.1.2
|
6
|
+
|
7
|
+
script: 'bundle exec rspec'
|
8
|
+
|
9
|
+
notifications:
|
10
|
+
slack:
|
11
|
+
rooms:
|
12
|
+
- backer-founder:gS9GQrOqRKRogUEU5ajHYFZ9#crowd-trail
|
13
|
+
email:
|
14
|
+
recipients:
|
15
|
+
- stan001212@gmail.com
|
16
|
+
on_failure: change
|
17
|
+
on_success: never
|
data/Gemfile
CHANGED
data/Guardfile
ADDED
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -1,2 +1,10 @@
|
|
1
|
+
require "rspec/core/rake_task"
|
1
2
|
require "bundler/gem_tasks"
|
3
|
+
# Default directory to look in is `spec/`
|
4
|
+
# Run with `rake spec`
|
5
|
+
# Defines the `spec` rake task.
#
# The formatter is spelled `documentation` rather than `nested`: `nested` is
# only recognised by RSpec 2, whereas `documentation` is accepted by both
# RSpec 2 and RSpec 3 (the gemspec does not pin an rspec version).
RSpec::Core::RakeTask.new(:spec) do |task|
  task.rspec_opts = ['--color', '--format', 'documentation']
end
|
8
|
+
|
9
|
+
task default: :spec
|
2
10
|
|
@@ -18,8 +18,20 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
+
spec.add_development_dependency "webmock"
|
22
|
+
spec.add_development_dependency "iconv"
|
23
|
+
spec.add_development_dependency "activesupport"
|
24
|
+
spec.add_development_dependency "vcr", "~> 2.9.3"
|
25
|
+
spec.add_development_dependency "rspec"
|
26
|
+
spec.add_development_dependency "rspec-nc"
|
27
|
+
spec.add_development_dependency "guard"
|
28
|
+
spec.add_development_dependency "guard-rspec"
|
29
|
+
spec.add_development_dependency "pry"
|
30
|
+
spec.add_development_dependency "pry-remote"
|
31
|
+
spec.add_development_dependency "pry-nav"
|
21
32
|
spec.add_development_dependency "bundler", "~> 1.6"
|
22
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
23
34
|
spec.add_runtime_dependency "parallel", "~> 1.3"
|
24
35
|
spec.add_runtime_dependency "nokogiri", "~> 1.6"
|
36
|
+
spec.add_runtime_dependency "httparty", "~> 0.13.3"
|
25
37
|
end
|
data/lib/crowd_funding_parser.rb
CHANGED
@@ -4,3 +4,7 @@ require "crowd_funding_parser/general"
|
|
4
4
|
require "crowd_funding_parser/parser/flyingv"
|
5
5
|
require "crowd_funding_parser/parser/webackers"
|
6
6
|
require "crowd_funding_parser/parser/zeczec"
|
7
|
+
require "crowd_funding_parser/parser/taobao"
|
8
|
+
require "crowd_funding_parser/parser/kickstarter"
|
9
|
+
require "crowd_funding_parser/parser/hereo"
|
10
|
+
require "crowd_funding_parser/parser/an9"
|
@@ -1,50 +1,53 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require "active_support/all"
|
3
|
+
require_relative "method_builder"
|
4
|
+
|
1
5
|
module CrowdFundingParser
|
2
6
|
module Parser
|
3
7
|
class General
|
4
|
-
|
5
|
-
|
8
|
+
include HTTParty
|
9
|
+
|
10
|
+
# Collects the figures that change over a campaign's lifetime.
#
# result      - payload handed unchanged to every get_* extractor.
# project_url - accepted for signature parity with parse_content_data;
#               not read by this method.
#
# Returns a Hash with the string keys money_goal, money_pledged,
# backer_count, left_time, status, fb_count and following_count (in that
# order). Every count/amount is coerced with #to_i; status is derived from
# the left_time value that was just extracted.
def parse_tracking_data(result, project_url)
  {}.tap do |tracking|
    %w[money_goal money_pledged backer_count].each do |field|
      tracking[field] = send("get_#{field}", result).to_i
    end

    tracking["left_time"] = get_left_time(result)
    tracking["status"] = get_status(tracking["left_time"])

    %w[fb_count following_count].each do |field|
      tracking[field] = send("get_#{field}", result).to_i
    end
  end
end
|
17
21
|
|
18
|
-
def parse_content_data(
|
19
|
-
|
20
|
-
project
|
21
|
-
project[
|
22
|
-
project[
|
23
|
-
project[
|
24
|
-
project[
|
25
|
-
project[
|
26
|
-
project[
|
27
|
-
project[
|
28
|
-
project[
|
22
|
+
# Collects the descriptive (non-tracking) attributes of a project.
#
# result      - payload handed unchanged to every get_* extractor.
# project_url - the project's URL; stored verbatim under "url" and passed to
#               get_id to obtain "platform_project_id".
#
# Returns a Hash with string keys, in this order: platform_project_id,
# title, url, summary, category, creator_name, creator_id, creator_link,
# currency_string, start_date, end_date, region.
def parse_content_data(result, project_url)
  content = {
    "platform_project_id" => get_id(project_url),
    "title" => get_title(result),
    "url" => project_url
  }

  # Each remaining key is filled by the extractor named get_<key>.
  %w[
    summary category creator_name creator_id creator_link
    currency_string start_date end_date region
  ].each do |field|
    content[field] = send("get_#{field}", result)
  end

  content
end
|
31
38
|
|
32
39
|
def get_project_links(required_status = "online")
|
33
40
|
links = []
|
34
|
-
|
35
|
-
|
41
|
+
|
42
|
+
get_lists.each do |target|
|
36
43
|
doc = Nokogiri::HTML(target)
|
37
44
|
online_projects = doc.css(@item_css_class)
|
38
45
|
|
39
46
|
Parallel.map(online_projects, in_processes: 2 , in_threads: 4) do |project|
|
40
47
|
link_nodes = project.css("a:nth-child(1)")
|
41
48
|
status = get_status(get_string(project.css(@status_css_class)))
|
42
|
-
link = link_nodes.first["href"]
|
43
|
-
if status ==
|
44
|
-
links << link
|
45
|
-
elsif status == "online" && required_status == "online"
|
46
|
-
links << link
|
47
|
-
elsif status == "preparing" && required_status == "preparing"
|
49
|
+
link = @platform_url + link_nodes.first["href"]
|
50
|
+
if status == required_status
|
48
51
|
links << link
|
49
52
|
end
|
50
53
|
end
|
@@ -53,42 +56,49 @@ module CrowdFundingParser
|
|
53
56
|
links
|
54
57
|
end
|
55
58
|
|
56
|
-
def
|
57
|
-
|
58
|
-
|
59
|
+
# Fetches the raw payload for a project.
#
# When @parse_method is :json the project id is taken from the URL, turned
# into an API address via get_project_api, and fetched with
# get_json_through_url. For any other @parse_method the project URL itself
# is fetched with get_doc_through_url.
def get_result(project_url)
  return get_doc_through_url(project_url) unless @parse_method == :json

  api_url = get_project_api(get_id(project_url))
  get_json_through_url(api_url)
end
|
68
|
+
|
69
|
+
# Performs an HTTP GET on +project_url+ through HTTParty and hands the
# response to Nokogiri::HTML, returning the parsed document.
def get_doc_through_url(project_url)
  Nokogiri::HTML(HTTParty.get(project_url))
end
|
60
73
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
74
|
+
# Performs an HTTP GET on +project_url+ and decodes the response body as JSON.
#
# JSON.parse is used instead of JSON.load because the body comes from a
# remote server: JSON.load is the API meant for trusted input, whereas
# JSON.parse always yields plain Hash/Array/scalar values.
#
# Returns the decoded structure, or nil when the body is nil or blank (the
# value JSON.load produced for an empty body).
# Raises JSON::ParserError when the body is not valid JSON.
def get_json_through_url(project_url)
  body = HTTParty.get(project_url).body
  return nil if body.nil? || body.strip.empty?

  JSON.parse(body)
end
|
65
78
|
|
66
|
-
def
|
67
|
-
|
79
|
+
# Fetches a project once and returns its full attribute hash: the content
# data merged with the tracking data. Where both hashes share a key, the
# tracking value wins (Hash#merge semantics).
def get_project(project_url)
  payload = get_result(project_url)
  content = parse_content_data(payload, project_url)
  tracking = parse_tracking_data(payload, project_url)
  content.merge(tracking)
end
|
69
83
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
Nokogiri::HTML(project_html)
|
84
|
+
# Extracts the platform's project id from a project URL: the last path
# segment of the URL relative to the platform root.
#
# The query string is removed *before* the path is split, so a "/" inside
# the query (e.g. "?ref=a/b") or a trailing slash before it no longer
# produces a wrong id.
#
# Returns the id as a String, or nil when the relative URL has no path
# segment at all.
def get_id(project_url)
  path = get_rel_url(project_url).split("?").first.to_s
  path.split("/").last
end
|
75
88
|
|
76
89
|
private
|
77
90
|
|
78
|
-
def
|
79
|
-
|
91
|
+
# Returns +url+ with every occurrence of @platform_url removed, i.e. the
# URL relative to the platform root. An unset @platform_url is treated as
# the empty string, which leaves +url+ unchanged.
def get_rel_url(url)
  platform_prefix = @platform_url.to_s
  url.gsub(platform_prefix, "")
end
|
81
94
|
|
82
|
-
def
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
hours = ((left_time / (60 * 60)) % 24).to_i
|
89
|
-
minutes = ((left_time / 60) % 60).to_i
|
90
|
-
"#{days}天#{hours}小時#{minutes}分鐘"
|
95
|
+
# Fallback for extractors a concrete parser does not implement.
#
# Any undefined method whose name starts with "get_" returns an empty
# string, so parse_content_data / parse_tracking_data can call the full set
# of get_* extractors on every parser. The match is anchored to the prefix:
# names that merely contain "get" (for example "target_x" or "forget_y")
# are passed on to the default NoMethodError handling instead of being
# silently answered with "".
def method_missing(m, *args, &block)
  return "" if m.to_s.start_with?("get_")

  super
end

# Keeps respond_to? consistent with method_missing above.
def respond_to_missing?(m, include_private = false)
  m.to_s.start_with?("get_") || super
end
|
92
102
|
end
|
93
103
|
end
|
94
|
-
end
|
104
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Small DSL for attaching extractor methods to a parser class.
#
# Usage sketch:
#
#   MethodBuilder.set_methods do
#     insert_parser "Zeczec"
#     set_method(:get_title) { |result| ... }
#   end
module MethodBuilder
  # Evaluates +block+ with a fresh ParserMethodProxy as self, so the DSL
  # methods below (and the string helpers) can be called bare inside it.
  def self.set_methods(&block)
    ParserMethodProxy.new.instance_eval(&block)
  end

  # Receiver of the DSL block; remembers the target parser class and defines
  # methods on it.
  class ParserMethodProxy
    # Resolves CrowdFundingParser::Parser::<inserted_class> and remembers it
    # as the class that subsequent set_method calls define methods on.
    # Returns a new instance of that class.
    def insert_parser(inserted_class)
      @parser_class = "CrowdFundingParser::Parser::#{inserted_class}".constantize
      @parser = @parser_class.new
    end

    # Runs +block+ immediately; a hook for setup code inside the DSL.
    def set_variable(&block)
      block.call
    end

    # Defines +method_name+ on the target parser class as a one-argument
    # instance method that delegates to +block+.
    #
    # The block is invoked with #call, so self inside it stays whatever it
    # was where the block was written, not the parser instance.
    #
    # Errors raised by the block are reported on stdout and turned into an
    # empty string. Only StandardError is rescued: Interrupt, SystemExit
    # and similar must still be able to stop the process.
    def set_method(method_name, &block)
      @parser_class.send(:define_method, method_name) do |arg|
        begin
          block.call(arg)
        rescue StandardError => e
          puts "Error #{e.message}"
          puts e.backtrace.first
          ""
        end
      end
    end

    # Any unknown message sent to the proxy answers with an empty string.
    def method_missing(m, *args, &block)
      ""
    end

    # Stripped text of the first element in +elements+.
    def get_string(elements)
      elements.first.text.strip
    end

    # Removes "$", "," and then "NT" from a money string.
    def money_string(money)
      money.delete("$,").gsub("NT", "")
    end

    # Formats a duration in seconds as "<days>天<hours>小時<minutes>分鐘".
    def convert_time(left_time)
      days = (left_time / (60 * 60 * 24)).to_i
      hours = ((left_time / (60 * 60)) % 24).to_i
      minutes = ((left_time / 60) % 60).to_i
      "#{days}天#{hours}小時#{minutes}分鐘"
    end
  end
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require "json"
|
2
|
+
require 'open-uri'
|
3
|
+
require "iconv"
|
4
|
+
|
5
|
+
module CrowdFundingParser
  module Parser
    # Parser for projects hosted under http://www.an9.com.tw/Dream/.
    #
    # NOTE(review): #initialize sets neither @parse_method nor @platform_url,
    # so every extractor below takes its non-:doc branch while
    # General#get_result fetches an HTML document. The :doc selectors also do
    # not look specific to this site. Confirm the intended configuration.
    class An9 < General
      def initialize
        @url = "http://www.an9.com.tw/Dream/"
        @status_css_class = ".sideCon>a"
      end

      # Returns the entries of +add_categories+ whose :parent_id is nil.
      def get_main_categories(add_categories)
        add_categories.select { |c| c[:parent_id].nil? }
      end

      # Probes @url + 1, @url + 2, ... and collects the URLs of projects
      # whose status equals +required_status+.
      #
      # The scan stops early once 50 "not found" pages or 50 errors have
      # accumulated since the last page that was parsed successfully.
      #
      # Work is done on 5 threads. Threads (not processes) are required:
      # the links array and both counters live in this method and are
      # updated from inside the block. The counters are updated without
      # locking and are therefore approximate.
      def get_project_links(required_status = "online")
        links = []
        error_count = 0
        not_found_count = 0

        Parallel.each(1..100000, in_threads: 5, progress: "Get #{self} links") do |i|
          begin
            link = @url + i.to_s
            project = get_doc_through_url(link)
            not_found_message = project.css(".actMsg p")

            if not_found_message.present? && get_string(not_found_message).match(/不存在/)
              not_found_count += 1
            else
              status = get_status(get_string(project.css(@status_css_class)))
              links << link if status == required_status
              not_found_count = 0
              error_count = 0
            end
          rescue StandardError
            # Best effort: a page that cannot be fetched or parsed is counted
            # and skipped.
            error_count += 1
          end

          # Checked after every page, outside the rescue, so that a run of
          # "not found" pages ends the scan even when nothing raised.
          raise Parallel::Break if not_found_count >= 50 || error_count >= 50
        end

        links
      end

      private

      # Stripped text of the first node in +elements+. Defined here because
      # General does not provide it; without it the calls above would be
      # answered by General#method_missing with an empty string.
      def get_string(elements)
        elements.first.text.strip
      end

      # Project title.
      def get_title(result)
        if @parse_method == :doc
          get_string(result.css(".NS_projects__header h2 .green-dark"))
        else
          result["name"]
        end
      end

      # Category name.
      def get_category(result)
        if @parse_method == :doc
          get_string(result.css(".container-flex .h5 a.grey-dark:nth-child(2) b"))
        else
          result["category"]["name"]
        end
      end

      # Creator display name.
      def get_creator_name(result)
        if @parse_method == :doc
          get_string(result.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog"))
        else
          result["creator"]["name"]
        end
      end

      # Creator id; in :doc mode it is the third-from-last segment of the
      # creator link's href.
      def get_creator_id(result)
        if @parse_method == :doc
          creator_link = result.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog").first["href"]
          creator_link.split("/")[-3]
        else
          result["creator"]["id"]
        end
      end

      # URL of the creator's page; in :doc mode the href is appended to @url.
      def get_creator_link(result)
        if @parse_method == :doc
          @url + result.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog").first["href"]
        else
          result["creator"]["urls"]["web"]["user"]
        end
      end

      # Short project description.
      def get_summary(result)
        if @parse_method == :doc
          get_string(result.css(".container-flex .col-8 .mobile-hide p.h3.mb3"))
        else
          result["blurb"]
        end
      end

      # Launch time. Returns nil in :doc mode (no start date on the page).
      def get_start_date(result)
        if @parse_method == :doc
          # no start date on page
        else
          Time.at(result["launched_at"])
        end
      end

      # Deadline: the datetime attribute string in :doc mode, otherwise a
      # Time built from result["deadline"].
      def get_end_date(result)
        if @parse_method == :doc
          result.css(".NS_projects__deadline_copy p.grey-dark time[datetime]")[0]["datetime"]
        else
          Time.at(result["deadline"])
        end
      end

      # Project location.
      def get_region(result)
        if @parse_method == :doc
          get_string(result.css(".container-flex .h5 a.grey-dark:nth-child(1) b"))
        else
          result["location"]["displayable_name"]
        end
      end

      # for tracking

      # Funding goal.
      def get_money_goal(result)
        if @parse_method == :doc
          result.css("div[data-pledged]").first["data-goal"]
        else
          result["goal"]
        end
      end

      # Amount pledged so far.
      def get_money_pledged(result)
        if @parse_method == :doc
          result.css("div[data-pledged]").first["data-pledged"]
        else
          result["pledged"]
        end
      end

      # Number of backers.
      def get_backer_count(result)
        if @parse_method == :doc
          result.css("div[data-backers-count]").first["data-backers-count"]
        else
          result["backers_count"]
        end
      end

      # Remaining time as "<whole days>天", or "已結束" once less than one
      # whole day is left.
      def get_left_time(result)
        if @parse_method == :doc
          end_date = result.css("div[data-end_time]").first["data-end_time"]
          last_seconds = Time.parse(end_date) - Time.now
        else
          last_seconds = result["deadline"].to_i - Time.now.to_i
        end

        last_day = last_seconds.to_i / 86400
        if last_day <= 0
          "已結束"
        else
          last_day.to_s + "天"
        end
      end

      # Maps a button/left-time text to a status string; anything that
      # matches none of the patterns counts as "online".
      def get_status(button_text)
        case button_text
        when /贊助/
          "online"
        when /喜歡/
          "voting"
        when /結束|成功/
          "finished"
        else
          "online"
        end
      end

      # Not available for this platform; returns nil.
      def get_fb_count(result)
      end

      # Not available for this platform; returns nil.
      def get_following_count(result)
      end

      # Backer list is not collected for this platform.
      def get_backer_list(project_url)
        []
      end

      # Currency code.
      def get_currency_string(result)
        if @parse_method == :doc
          result.css("data[data-currency]")[0]["data-currency"]
        else
          result["currency"]
        end
      end
    end
  end
end
|