crowd_funding_parser 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.travis.yml +17 -0
- data/Gemfile +2 -0
- data/Guardfile +11 -0
- data/README.md +2 -0
- data/Rakefile +8 -0
- data/crowd_funding_parser.gemspec +12 -0
- data/lib/crowd_funding_parser.rb +4 -0
- data/lib/crowd_funding_parser/general.rb +63 -53
- data/lib/crowd_funding_parser/method_builder.rb +47 -0
- data/lib/crowd_funding_parser/parser/an9.rb +196 -0
- data/lib/crowd_funding_parser/parser/flyingv.rb +64 -64
- data/lib/crowd_funding_parser/parser/hereo.rb +99 -0
- data/lib/crowd_funding_parser/parser/kickstarter.rb +158 -0
- data/lib/crowd_funding_parser/parser/taobao.rb +150 -0
- data/lib/crowd_funding_parser/parser/webackers.rb +73 -66
- data/lib/crowd_funding_parser/parser/zeczec.rb +79 -60
- data/lib/crowd_funding_parser/version.rb +1 -1
- data/spec/fixtures/vcr_cassettes/flyingv.yml +4869 -0
- data/spec/fixtures/vcr_cassettes/zeczec.yml +791 -0
- data/spec/parsers/flyingv_spec.rb +62 -0
- data/spec/parsers/zezec_spec.rb +62 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/support/vcr_sites.rb +8 -0
- metadata +191 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a49010d36a36cab50f0c001bb1d368934c1126a3
|
4
|
+
data.tar.gz: 177a84fa3773c046fd6e4488c3470aae83c25cef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e3129df45328fb30f828ce77de424813926dae5d169da8ca66e0e3c55dcf5f5590e02bebf2810be137c78d2e2fbe59b251250b453e39c064678a9422045392a
|
7
|
+
data.tar.gz: 028b83e89377e5c631893be152b902e4f6430c187d43d5fedc5e4e18b67d0dfdfc80417c0d66e39d25a9026f51ef5432bc0cdd3d188fe6e376b680e3d4fb94aa
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
|
4
|
+
rvm:
|
5
|
+
- 2.1.2
|
6
|
+
|
7
|
+
script: 'bundle exec rspec'
|
8
|
+
|
9
|
+
notifications:
|
10
|
+
slack:
|
11
|
+
rooms:
|
12
|
+
- backer-founder:gS9GQrOqRKRogUEU5ajHYFZ9#crowd-trail
|
13
|
+
email:
|
14
|
+
recipients:
|
15
|
+
- stan001212@gmail.com
|
16
|
+
on_failure: change
|
17
|
+
on_success: never
|
data/Gemfile
CHANGED
data/Guardfile
ADDED
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -1,2 +1,10 @@
|
|
1
|
+
require "rspec/core/rake_task"
|
1
2
|
require "bundler/gem_tasks"
|
3
|
+
# Default directory to look in is `spec/`
|
4
|
+
# Run with `rake spec`
|
5
|
+
RSpec::Core::RakeTask.new(:spec) do |task|
|
6
|
+
task.rspec_opts = ['--color', '--format', 'nested']
|
7
|
+
end
|
8
|
+
|
9
|
+
task default: :spec
|
2
10
|
|
@@ -18,8 +18,20 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
+
spec.add_development_dependency "webmock"
|
22
|
+
spec.add_development_dependency "iconv"
|
23
|
+
spec.add_development_dependency "activesupport"
|
24
|
+
spec.add_development_dependency "vcr", "~> 2.9.3"
|
25
|
+
spec.add_development_dependency "rspec"
|
26
|
+
spec.add_development_dependency "rspec-nc"
|
27
|
+
spec.add_development_dependency "guard"
|
28
|
+
spec.add_development_dependency "guard-rspec"
|
29
|
+
spec.add_development_dependency "pry"
|
30
|
+
spec.add_development_dependency "pry-remote"
|
31
|
+
spec.add_development_dependency "pry-nav"
|
21
32
|
spec.add_development_dependency "bundler", "~> 1.6"
|
22
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
23
34
|
spec.add_runtime_dependency "parallel", "~> 1.3"
|
24
35
|
spec.add_runtime_dependency "nokogiri", "~> 1.6"
|
36
|
+
spec.add_runtime_dependency "httparty", "~> 0.13.3"
|
25
37
|
end
|
data/lib/crowd_funding_parser.rb
CHANGED
@@ -4,3 +4,7 @@ require "crowd_funding_parser/general"
|
|
4
4
|
require "crowd_funding_parser/parser/flyingv"
|
5
5
|
require "crowd_funding_parser/parser/webackers"
|
6
6
|
require "crowd_funding_parser/parser/zeczec"
|
7
|
+
require "crowd_funding_parser/parser/taobao"
|
8
|
+
require "crowd_funding_parser/parser/kickstarter"
|
9
|
+
require "crowd_funding_parser/parser/hereo"
|
10
|
+
require "crowd_funding_parser/parser/an9"
|
@@ -1,50 +1,53 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require "active_support/all"
|
3
|
+
require_relative "method_builder"
|
4
|
+
|
1
5
|
module CrowdFundingParser
|
2
6
|
module Parser
|
3
7
|
class General
|
4
|
-
|
5
|
-
|
8
|
+
include HTTParty
|
9
|
+
|
10
|
+
def parse_tracking_data(result, project_url)
|
6
11
|
project = Hash.new
|
7
|
-
project[
|
8
|
-
project[
|
9
|
-
project[
|
10
|
-
project[
|
11
|
-
project[
|
12
|
-
|
13
|
-
project[
|
14
|
-
project['following_count'] = get_following_count(doc).to_i
|
12
|
+
project["money_goal"] = get_money_goal(result).to_i
|
13
|
+
project["money_pledged"] = get_money_pledged(result).to_i
|
14
|
+
project["backer_count"] = get_backer_count(result).to_i
|
15
|
+
project["left_time"] = get_left_time(result)
|
16
|
+
project["status"] = get_status(project["left_time"])
|
17
|
+
project["fb_count"] = get_fb_count(result).to_i
|
18
|
+
project["following_count"] = get_following_count(result).to_i
|
15
19
|
project
|
16
20
|
end
|
17
21
|
|
18
|
-
def parse_content_data(
|
19
|
-
|
20
|
-
project
|
21
|
-
project[
|
22
|
-
project[
|
23
|
-
project[
|
24
|
-
project[
|
25
|
-
project[
|
26
|
-
project[
|
27
|
-
project[
|
28
|
-
project[
|
22
|
+
# Collects the descriptive (non-tracking) fields of one project.
#
# Each value comes from the matching get_* extractor, which receives the
# fetched +result+; the platform id is derived from +project_url+ and the
# url is stored as given.
#
# @param result the fetched project payload handed to every extractor
# @param project_url [String] URL of the project page
# @return [Hash] string-keyed project attributes, in the order listed below
def parse_content_data(result, project_url)
  {
    "platform_project_id" => get_id(project_url),
    "title"               => get_title(result),
    "url"                 => project_url,
    "summary"             => get_summary(result),
    "category"            => get_category(result),
    "creator_name"        => get_creator_name(result),
    "creator_id"          => get_creator_id(result),
    "creator_link"        => get_creator_link(result),
    "currency_string"     => get_currency_string(result),
    "start_date"          => get_start_date(result),
    "end_date"            => get_end_date(result),
    "region"              => get_region(result)
  }
end
|
31
38
|
|
32
39
|
def get_project_links(required_status = "online")
|
33
40
|
links = []
|
34
|
-
|
35
|
-
|
41
|
+
|
42
|
+
get_lists.each do |target|
|
36
43
|
doc = Nokogiri::HTML(target)
|
37
44
|
online_projects = doc.css(@item_css_class)
|
38
45
|
|
39
46
|
Parallel.map(online_projects, in_processes: 2 , in_threads: 4) do |project|
|
40
47
|
link_nodes = project.css("a:nth-child(1)")
|
41
48
|
status = get_status(get_string(project.css(@status_css_class)))
|
42
|
-
link = link_nodes.first["href"]
|
43
|
-
if status ==
|
44
|
-
links << link
|
45
|
-
elsif status == "online" && required_status == "online"
|
46
|
-
links << link
|
47
|
-
elsif status == "preparing" && required_status == "preparing"
|
49
|
+
link = @platform_url + link_nodes.first["href"]
|
50
|
+
if status == required_status
|
48
51
|
links << link
|
49
52
|
end
|
50
53
|
end
|
@@ -53,42 +56,49 @@ module CrowdFundingParser
|
|
53
56
|
links
|
54
57
|
end
|
55
58
|
|
56
|
-
def
|
57
|
-
|
58
|
-
|
59
|
+
# Fetches the raw payload for a project.
#
# When @parse_method is :json the project id is taken from the URL, turned
# into an API URL via get_project_api and fetched as JSON; in every other
# case the project page itself is fetched and parsed as HTML.
#
# @param project_url [String] URL of the project page
# @return the parsed JSON payload or the parsed HTML document
def get_result(project_url)
  return get_doc_through_url(project_url) unless @parse_method == :json

  api_url = get_project_api(get_id(project_url))
  get_json_through_url(api_url)
end
|
68
|
+
|
69
|
+
# Downloads +project_url+ with HTTParty and parses the response as HTML.
#
# @param project_url [String] page to fetch
# @return the document produced by Nokogiri::HTML
def get_doc_through_url(project_url)
  Nokogiri::HTML(HTTParty.get(project_url))
end
|
60
73
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
74
|
+
# Downloads +project_url+ with HTTParty and decodes the response body as JSON.
#
# NOTE(review): the body comes from a remote server and is decoded with
# JSON.load; consider whether the stricter JSON.parse is acceptable here.
#
# @param project_url [String] API endpoint to fetch
# @return the decoded JSON value
def get_json_through_url(project_url)
  response = HTTParty.get(project_url)
  JSON.load(response.body)
end
|
65
78
|
|
66
|
-
def
|
67
|
-
|
79
|
+
# Fetches one project and returns all of its parsed attributes.
#
# The payload is fetched once and handed to both parsers; on a key clash
# the tracking data wins because it is merged last.
#
# @param project_url [String] URL of the project page
# @return [Hash] content fields merged with tracking fields
def get_project(project_url)
  result = get_result(project_url)
  content = parse_content_data(result, project_url)
  tracking = parse_tracking_data(result, project_url)
  content.merge(tracking)
end
|
69
83
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
Nokogiri::HTML(project_html)
|
84
|
+
# Extracts the platform's project id from a project URL.
#
# The id is the last "/"-separated segment of the platform-relative URL,
# with anything from the first "?" onwards removed.
#
# @param project_url [String] URL of the project page
# @return [String] the id segment
def get_id(project_url)
  last_segment = get_rel_url(project_url).split("/").last
  last_segment.split("?").first
end
|
75
88
|
|
76
89
|
private
|
77
90
|
|
78
|
-
def
|
79
|
-
|
91
|
+
# Strips every occurrence of @platform_url from +url+, leaving the
# platform-relative part. An unset @platform_url leaves +url+ unchanged.
#
# @param url [String] absolute project URL
# @return [String] the URL without the platform prefix
def get_rel_url(url)
  url.gsub(@platform_url.to_s, "")
end
|
81
94
|
|
82
|
-
def
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
hours = ((left_time / (60 * 60)) % 24).to_i
|
89
|
-
minutes = ((left_time / 60) % 60).to_i
|
90
|
-
"#{days}天#{hours}小時#{minutes}分鐘"
|
95
|
+
# Fallback for extractors a parser does not implement: any missing method
# whose name starts with "get_" answers with an empty string, so the
# parse_* methods can call the full extractor set on every parser.
#
# The name test is anchored to the "get_" prefix. The previous unanchored
# /get/ match also swallowed unrelated names that merely contain "get"
# (e.g. "target_x"), hiding typos; those now raise as usual.
#
# @param m [Symbol] name of the missing method
# @return [String] "" for get_* names
# @raise [NoMethodError] for every other missing method
def method_missing(m, *args, &block)
  if m.to_s.start_with?("get_")
    ""
  else
    super
  end
end

# Keeps respond_to? in line with method_missing above.
def respond_to_missing?(name, include_private = false)
  name.to_s.start_with?("get_") || super
end
|
92
102
|
end
|
93
103
|
end
|
94
|
-
end
|
104
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Small DSL for attaching extractor methods to a parser class.
#
#   MethodBuilder.set_methods do
#     insert_parser "SomeParser"
#     set_method(:get_title) { |result| ... }
#   end
module MethodBuilder
  # Evaluates +block+ with a fresh ParserMethodProxy as self, so the block
  # can call insert_parser, set_variable, set_method and the helpers below.
  def self.set_methods(&block)
    ParserMethodProxy.new.instance_eval(&block)
  end

  # Receiver of the DSL block; holds the parser class being extended.
  class ParserMethodProxy
    # Selects the parser class CrowdFundingParser::Parser::<inserted_class>
    # and instantiates it once.
    # Relies on String#constantize (ActiveSupport) being loaded by the caller.
    #
    # @param inserted_class [String, Symbol] class name inside CrowdFundingParser::Parser
    def insert_parser(inserted_class)
      @parser_class = "CrowdFundingParser::Parser::#{inserted_class}".constantize
      @parser = @parser_class.new
    end

    # Runs +block+ immediately; a hook for setup code inside the DSL.
    def set_variable(&block)
      block.call
    end

    # Defines a one-argument instance method +method_name+ on the selected
    # parser class. The method delegates to +block+; if the block raises, the
    # error is printed and "" is returned so one failing extractor does not
    # abort a whole parse.
    #
    # Only StandardError is rescued: interrupts, exit requests and
    # out-of-memory conditions must keep propagating.
    #
    # @param method_name [Symbol, String] name of the method to define
    def set_method(method_name, &block)
      @parser_class.send(:define_method, method_name) do |arg|
        begin
          block.call(arg)
        rescue StandardError => e
          puts "Error #{e.message}"
          puts Array(e.backtrace).first
          ""
        end
      end
    end

    # Any unknown call inside the DSL block evaluates to "".
    def method_missing(m, *args, &block)
      ""
    end

    # Mirrors method_missing: every name is answered.
    def respond_to_missing?(name, include_private = false)
      true
    end

    # @param elements a collection of nodes responding to #first
    # @return [String] stripped text of the first element
    def get_string(elements)
      elements.first.text.strip
    end

    # Removes "$", "," and "NT" from a money string.
    #
    # @param money [String] e.g. "NT$1,000"
    # @return [String] e.g. "1000"
    def money_string(money)
      money.gsub("$", "").gsub(",", "").gsub("NT", "")
    end

    # Formats a duration in seconds as days, hours and minutes.
    #
    # @param left_time [Numeric] remaining seconds
    # @return [String] "<days>天<hours>小時<minutes>分鐘"
    def convert_time(left_time)
      days = (left_time / (60 * 60 * 24)).to_i
      hours = ((left_time / (60 * 60)) % 24).to_i
      minutes = ((left_time / 60) % 60).to_i
      "#{days}天#{hours}小時#{minutes}分鐘"
    end
  end
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require "json"
|
2
|
+
require 'open-uri'
|
3
|
+
require "iconv"
|
4
|
+
|
5
|
+
module CrowdFundingParser
  module Parser
    # Parser for project pages under http://www.an9.com.tw/Dream/<id>.
    #
    # Every extractor has two branches: when @parse_method is :doc the value
    # is read from a parsed HTML document through CSS selectors, otherwise it
    # is read from +result+ by key.
    # NOTE(review): this class never assigns @parse_method itself, and the
    # :doc selectors (".NS_projects__...") do not look an9-specific — confirm
    # both against the live site.
    class An9 < General
      def initialize
        @url = "http://www.an9.com.tw/Dream/"
        @status_css_class = ".sideCon>a"
      end

      # @param add_categories [Array<Hash>] categories with a :parent_id key
      # @return [Array<Hash>] the categories that have no parent
      def get_main_categories(add_categories)
        add_categories.select { |c| c[:parent_id].nil? }
      end

      # Walks the numeric project ids 1..100000 and collects the URLs whose
      # page status equals +required_status+.
      #
      # The scan stops early once 50 "not found" pages or 50 errors have
      # accumulated since the last successfully read project. The stop check
      # runs after every page; previously it sat inside the rescue clause, so
      # the not-found threshold could never end the scan on its own.
      #
      # NOTE(review): both in_processes and in_threads are passed (as in
      # General#get_project_links); +links+ and the counters are plain locals
      # shared by the workers, which only works when threads are used —
      # confirm which option Parallel honours.
      #
      # @param required_status [String] status as returned by get_status
      # @return [Array<String>] matching project URLs
      def get_project_links(required_status = "online")
        links = []
        error_count = 0
        not_found_count = 0

        Parallel.each(1..100000, in_processes: 2, in_threads: 5, progress: "Get #{self} links") do |i|
          begin
            link = @url + i.to_s
            project = get_doc_through_url(link)
            not_found_message = project.css(".actMsg p")

            if not_found_message.present? && get_string(not_found_message).match(/不存在/)
              not_found_count += 1
            else
              status = get_status(get_string(project.css(@status_css_class)))
              links << link if status == required_status
              not_found_count = 0
              error_count = 0
            end
          rescue StandardError
            # A failing page must not abort the scan; it only counts
            # towards the error threshold checked below.
            error_count += 1
          end

          raise Parallel::Break if not_found_count >= 50 || error_count >= 50
        end

        links
      end

      private

      # True when values are to be read from an HTML document.
      def doc_mode?
        @parse_method == :doc
      end

      # Stripped text of the first matched element, or "" when nothing
      # matched. Defined here because the visible General class has no
      # get_string, so the call would fall through to its method_missing
      # and always yield "".
      def get_string(elements)
        node = elements.first
        node ? node.text.strip : ""
      end

      def get_title(result)
        if doc_mode?
          get_string(result.css(".NS_projects__header h2 .green-dark"))
        else
          result["name"]
        end
      end

      def get_category(result)
        if doc_mode?
          get_string(result.css(".container-flex .h5 a.grey-dark:nth-child(2) b"))
        else
          result["category"]["name"]
        end
      end

      def get_creator_name(result)
        if doc_mode?
          get_string(result.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog"))
        else
          result["creator"]["name"]
        end
      end

      # In :doc mode the id is the third-from-last path segment of the
      # creator link.
      def get_creator_id(result)
        if doc_mode?
          creator_link = result.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog").first["href"]
          creator_link.split("/")[-3]
        else
          result["creator"]["id"]
        end
      end

      def get_creator_link(result)
        if doc_mode?
          @url + result.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog").first["href"]
        else
          result["creator"]["urls"]["web"]["user"]
        end
      end

      def get_summary(result)
        if doc_mode?
          get_string(result.css(".container-flex .col-8 .mobile-hide p.h3.mb3"))
        else
          result["blurb"]
        end
      end

      # @return [Time, nil] nil in :doc mode, where the page shows no start date
      def get_start_date(result)
        return nil if doc_mode?

        Time.at(result["launched_at"])
      end

      # @return [String, Time] the datetime attribute in :doc mode, otherwise
      #   a Time built from the "deadline" value
      def get_end_date(result)
        if doc_mode?
          result.css(".NS_projects__deadline_copy p.grey-dark time[datetime]")[0]["datetime"]
        else
          Time.at(result["deadline"])
        end
      end

      def get_region(result)
        if doc_mode?
          get_string(result.css(".container-flex .h5 a.grey-dark:nth-child(1) b"))
        else
          result["location"]["displayable_name"]
        end
      end

      # for tracking

      def get_money_goal(result)
        if doc_mode?
          result.css("div[data-pledged]").first["data-goal"]
        else
          result["goal"]
        end
      end

      def get_money_pledged(result)
        if doc_mode?
          result.css("div[data-pledged]").first["data-pledged"]
        else
          result["pledged"]
        end
      end

      def get_backer_count(result)
        if doc_mode?
          result.css("div[data-backers-count]").first["data-backers-count"]
        else
          result["backers_count"]
        end
      end

      # Remaining time as "<n>天", or "已結束" once the deadline has passed.
      #
      # The ended check looks at the remaining seconds, not at whole days:
      # a project with less than 24 hours left used to be reported as ended
      # (and therefore as "finished" by get_status); it now reports "0天".
      def get_left_time(result)
        last_seconds =
          if doc_mode?
            end_date = result.css("div[data-end_time]").first["data-end_time"]
            Time.parse(end_date) - Time.now
          else
            result["deadline"].to_i - Time.now.to_i
          end

        return "已結束" if last_seconds <= 0

        "#{last_seconds.to_i / 86400}天"
      end

      # Maps button or left-time text to a status name; anything
      # unrecognised counts as "online".
      def get_status(button_text)
        case button_text
        when /贊助/
          "online"
        when /喜歡/
          "voting"
        when /結束|成功/
          "finished"
        else
          "online"
        end
      end

      # No value is extracted here; returns nil.
      def get_fb_count(result)
      end

      # No value is extracted here; returns nil.
      def get_following_count(result)
      end

      def get_backer_list(project_url)
        []
      end

      def get_currency_string(result)
        if doc_mode?
          result.css("data[data-currency]")[0]["data-currency"]
        else
          result["currency"]
        end
      end
    end
  end
end
|