crowd_funding_parser 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.travis.yml +17 -0
- data/Gemfile +2 -0
- data/Guardfile +11 -0
- data/README.md +2 -0
- data/Rakefile +8 -0
- data/crowd_funding_parser.gemspec +12 -0
- data/lib/crowd_funding_parser.rb +4 -0
- data/lib/crowd_funding_parser/general.rb +63 -53
- data/lib/crowd_funding_parser/method_builder.rb +47 -0
- data/lib/crowd_funding_parser/parser/an9.rb +196 -0
- data/lib/crowd_funding_parser/parser/flyingv.rb +64 -64
- data/lib/crowd_funding_parser/parser/hereo.rb +99 -0
- data/lib/crowd_funding_parser/parser/kickstarter.rb +158 -0
- data/lib/crowd_funding_parser/parser/taobao.rb +150 -0
- data/lib/crowd_funding_parser/parser/webackers.rb +73 -66
- data/lib/crowd_funding_parser/parser/zeczec.rb +79 -60
- data/lib/crowd_funding_parser/version.rb +1 -1
- data/spec/fixtures/vcr_cassettes/flyingv.yml +4869 -0
- data/spec/fixtures/vcr_cassettes/zeczec.yml +791 -0
- data/spec/parsers/flyingv_spec.rb +62 -0
- data/spec/parsers/zezec_spec.rb +62 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/support/vcr_sites.rb +8 -0
- metadata +191 -4
@@ -1,90 +1,90 @@
|
|
1
|
-
require 'open-uri'
|
2
|
-
|
3
1
|
module CrowdFundingParser
|
4
2
|
module Parser
|
5
3
|
class Flyingv < General
|
6
|
-
|
7
|
-
|
8
|
-
@url = "https://www.flyingv.cc"
|
9
|
-
@targets = []
|
4
|
+
# Declares the Flyingv field extractors through the MethodBuilder DSL
# (MethodBuilder, get_string and money_string are defined elsewhere).
MethodBuilder.set_methods do
  insert_parser "Flyingv"

  # Values shared by the extractor blocks below.
  set_variable do
    @platform_url = "https://www.flyingv.cc"
    # Two yyyy/mm/dd dates: capture 1 is read as the start date, capture 2 as the end date.
    @time_regex = /(\d{4}\/\d{2}\/\d{2}).+(\d{4}\/\d{2}\/\d{2})/
  end

  # Project title text.
  set_method :get_title do |doc|
    title_nodes = doc.css(".page-title-wrapper").css(".pagesTitle")
    get_string(title_nodes)
  end

  # Text of the first link in the first ".pageDes" element.
  set_method :get_category do |doc|
    descriptions = doc.css(".page-title-wrapper").css(".pageDes")
    descriptions[0].css("a")[0].text
  end

  # Stripped text of the first link in the second ".pageDes" element.
  set_method :get_creator_name do |doc|
    creator_anchor = doc.css(".page-title-wrapper").css(".pageDes")[1].css("a")[0]
    creator_anchor.text.strip
  end

  # Last path segment of the creator link's href.
  set_method :get_creator_id do |doc|
    creator_anchor = doc.css(".page-title-wrapper").css(".pageDes")[1].css("a")[0]
    href_segments = creator_anchor["href"].split("/")
    href_segments[-1]
  end

  # Platform URL followed by the href of the profile link.
  set_method :get_creator_link do |doc|
    profile_anchor = doc.css(".profilemeta .imp a")[0]
    @platform_url + profile_anchor["href"]
  end

  # First 501 characters of the project content, stripped.
  set_method :get_summary do |doc|
    content = doc.css(".project_content")[0].text.to_s
    content[0..500].strip
  end

  # First date captured by @time_regex in the sidebar text.
  set_method :get_start_date do |doc|
    sidebar_text = get_string(doc.css(".col-xs-4.sidebarprj"))
    dates = @time_regex.match(sidebar_text.gsub(/\n/, ""))
    dates[1]
  end

  # Second date captured by @time_regex in the sidebar text.
  set_method :get_end_date do |doc|
    sidebar_text = get_string(doc.css(".col-xs-4.sidebarprj"))
    dates = @time_regex.match(sidebar_text.gsub(/\n/, ""))
    dates[2]
  end

  # Region is a fixed value for this platform.
  set_method :get_region do |doc|
    "Taiwan"
  end

  # Funding goal, passed through money_string.
  set_method :get_money_goal do |doc|
    goal_text = get_string(doc.css(".countdes .dt .white"))
    money_string(goal_text)
  end

  # Amount pledged so far, passed through money_string.
  set_method :get_money_pledged do |doc|
    pledged_text = get_string(doc.css(".countdes .ut .rtt h3"))
    money_string(pledged_text)
  end

  # Backer count text with the "人贊助" suffix removed.
  set_method :get_backer_count do |doc|
    backers_text = get_string(doc.css(".countdes .dt .pull-right"))
    backers_text.sub("人贊助", "")
  end

  # Remaining-time text with the "剩餘" prefix removed.
  set_method :get_left_time do |doc|
    remaining_text = get_string(doc.css(".countdes .dt div:nth-child(2)"))
    remaining_text.sub("剩餘", "")
  end

  # Maps the remaining-time text onto "finished", "preparing" or "online".
  set_method :get_status do |left_time|
    case
    when left_time.match("已結束") then "finished"
    when left_time.match("開始") then "preparing"
    else "online"
    end
  end

  # Facebook share counter text.
  set_method :get_fb_count do |doc|
    share_nodes = doc.css("#fbBtn .sharenumber")
    get_string(share_nodes)
  end

  # Follower count text with either spelling of the "following" label removed.
  set_method :get_following_count do |doc|
    followers_text = get_string(doc.css(".sidebarprj h5"))
    without_label = followers_text.sub("人追踨", "").sub("追蹤", "")
    without_label.strip
  end

  # Currency code is a fixed value for this platform.
  set_method :get_currency_string do |result|
    "twd"
  end
end
|
88
88
|
end
|
89
89
|
end
|
90
|
-
end
|
90
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module CrowdFundingParser
|
2
|
+
module Parser
|
3
|
+
class Hereo < General
|
4
|
+
# Stores the Hereo base URL and the CSS selectors used for list items and
# for the status element.
def initialize
  @item_css_class = ".project-list ul li"
  @platform_url = "http://www.hereo.cc/"
  @status_css_class = ".projectImg .info .inner .detail span:nth-child(1)"
end
|
9
|
+
|
10
|
+
# Fetches the Hereo project-list page.
#
# @platform_url is configured with a trailing "/" while the path starts with
# one, so the trailing slash is dropped before joining; this avoids requesting
# "http://www.hereo.cc//project-list.php".
#
# @return [Array] a one-element array holding whatever HTTParty.get returns
def get_lists
  list_url = @platform_url.chomp("/") + "/project-list.php"
  [HTTParty.get(list_url)]
end
|
13
|
+
|
14
|
+
# Returns the part of the project URL that follows the last "pid=" marker.
#
# @param project_url [String] URL of a Hereo project page
# @return [String, nil] the whole input when the marker is absent; nil for ""
def get_id(project_url)
  pieces = project_url.split("pid=")
  pieces[-1]
end
|
17
|
+
|
18
|
+
# Declares the Hereo field extractors through the MethodBuilder DSL
# (MethodBuilder, get_string and money_string are defined elsewhere).
MethodBuilder.set_methods do
  insert_parser "Hereo"

  # Value shared by the extractor blocks below.
  set_variable do
    @platform_url = "http://www.hereo.cc/"
  end

  # Project title text.
  set_method :get_title do |doc|
    get_string(doc.css(".container .text h3"))
  end

  # Project tag text, used as the category.
  set_method :get_category do |doc|
    get_string(doc.css(".contentMain .projectTag"))
  end

  # Creator display name.
  set_method :get_creator_name do |doc|
    get_string(doc.css(".user-info .user .name h4 a"))
  end

  # Digits following "mid=" in the creator link's href.
  set_method :get_creator_id do |doc|
    doc.css(".user-info .user .name h4 a")[0]["href"].match(/mid=(\d+)/)[1]
  end

  # Platform URL followed by the creator link's href.
  # NOTE(review): @platform_url already ends with "/"; confirm the href is
  # relative (no leading "/"), otherwise the result contains "//".
  set_method :get_creator_link do |doc|
    @platform_url + doc.css(".user-info .user .name h4 a")[0]["href"]
  end

  # Text of the first "div.text" inside ".container", with all whitespace removed.
  set_method :get_summary do |doc|
    doc.css(".container div.text").first.text.gsub(/\s/, "")
  end

  # No start date is extracted for this platform; the block yields nil.
  set_method :get_start_date do |doc|
    nil
  end

  # First yyyy/mm/dd date in the project detail text ("" when none is found).
  set_method :get_end_date do |doc|
    doc.css(".projectInfo .detail .inner p").text.match(/\d{4}\/\d{2}\/\d{2}/).to_s
  end

  # Region is a fixed value for this platform.
  set_method :get_region do |doc|
    "Taiwan"
  end

  # First run of digits/commas in the funded amount, passed through money_string.
  set_method :get_money_pledged do |doc|
    money_string(doc.css(".projectInfo .funded .inner .number strong").text.match(/[0-9,]+/).to_s)
  end

  # Funding goal, passed through money_string.
  set_method :get_money_goal do |doc|
    money_string(get_string(doc.css(".sidebar h3.num")))
  end

  # Backer count text.
  set_method :get_backer_count do |doc|
    doc.css(".projectInfo .table .numberOfPeople .inner strong").text
  end

  # Remaining time as "<number><unit>" where the unit is 天 (days) or 小時 (hours).
  # When the text carries no such count, the whitespace-stripped text itself is
  # returned instead of failing on a nil match, so get_status can still inspect
  # it for its "結束" / "成功" / "集資中" markers.
  set_method :get_left_time do |doc|
    raw_string = doc.css(".projectInfo .table .time .inner").text.gsub(/\s/, "")
    match_data = raw_string.match(/(\d+).*(天|小時)/)
    match_data ? match_data[1] + match_data[2] : raw_string
  end

  # Maps the remaining-time text onto "finished" or "online".
  set_method :get_status do |left_time|
    if left_time.match("集資中")
      "online"
    elsif left_time.match("結束") || left_time.match("成功") || left_time.match(/\d+/).to_s == "0"
      "finished"
    else
      "online"
    end
  end

  # Follower count text. The block takes |doc| like every other extractor;
  # without the parameter, `doc` in the body had nothing to refer to.
  set_method :get_following_count do |doc|
    doc.css("strong#track-count").text
  end

  # Currency code is a fixed value for this platform.
  set_method :get_currency_string do |result|
    "twd"
  end
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
@@ -0,0 +1,158 @@
|
|
1
|
+
require "json"
|
2
|
+
require "iconv"
|
3
|
+
|
4
|
+
module CrowdFundingParser
|
5
|
+
module Parser
|
6
|
+
class Kickstarter < General
|
7
|
+
# Stores the Kickstarter base URL, the category ids that get_project_links
# iterates over, and the parse mode.
def initialize
  @category_ids = [1, 3, 6, 7] + (9..18).to_a + [26]
  @platform_url = "https://www.kickstarter.com"
  @parse_method = :doc
end
|
12
|
+
|
13
|
+
# Collects the distinct categories referenced by the projects returned from
# get_category_project_jsons.
#
# The page results are flattened into a single list of project hashes first.
# The original called `flatten.compact!` without keeping the result and then
# indexed whole page arrays with "category". Building the hashes is plain
# in-memory work, so no worker threads are used and the result order follows
# the input order.
#
# @param status [String] "online" or "finished" (see get_status_code)
# @return [Array<Hash>] unique hashes with :id, :name and :parent_id
def get_all_categories(status = "online")
  status_code = get_status_code(status)
  projects = get_category_project_jsons(status_code).flatten.compact

  categories = projects.map do |project|
    category = project["category"]
    next unless category # skip projects that carry no category data

    { id: category["id"], name: category["name"], parent_id: category["parent_id"] }
  end
  categories.compact.uniq
end
|
24
|
+
|
25
|
+
# Returns the web URLs of the projects, across all configured categories,
# whose state equals the requested status.
#
# A project is kept when its state equals the status code and it is either
# live or has a non-zero pledged amount (the same rule as before, written
# positively). Filtered-out projects are omitted instead of leaving nil
# entries in the result, and the filtering runs inline because it does no I/O.
#
# @param status [String] "online" or "finished" (see get_status_code)
# @return [Array<String>] project URLs
def get_project_links(status = "online")
  status_code = get_status_code(status)

  projects = @category_ids.map do |category_id|
    get_category_project_jsons(status_code, category_id)
  end.flatten.compact

  matching = projects.select do |project|
    state = project["state"]
    state == status_code && (state == "live" || project["pledged"].to_i != 0)
  end
  matching.map { |project| project["urls"]["web"]["project"] }
end
|
40
|
+
|
41
|
+
# Downloads the search-result pages (1..200) for one category and state and
# returns the "projects" payload of every page that had one.
#
# Paging stops as soon as a page cannot be fetched or has no projects. The
# original merely referenced `Parallel::Stop` inside its rescue, which does
# not stop anything, so all 200 pages were always requested. Raising
# `Parallel::Break` is the parallel gem's way to stop handing out further
# work. Only StandardError is rescued, so signals and exits are not swallowed.
# The misspelled `in_precesses` option is dropped and only `in_threads` is
# passed. Pages are fetched by worker threads, so the order of the returned
# pages is not guaranteed.
#
# @param status_code [String] Kickstarter state, e.g. "live"
# @param category_id [Integer] Kickstarter category id
# @return [Array] one entry (the page's "projects" value) per fetched page
def get_category_project_jsons(status_code = "live", category_id = 0)
  jsons = []

  Parallel.each(1..200, in_threads: 5) do |page|
    begin
      api_url = get_projects_page_api(page, status_code, category_id)
      projects = get_json_through_url(api_url)["projects"]
    rescue StandardError
      raise Parallel::Break
    end
    # An empty page means the listing is exhausted.
    raise Parallel::Break if projects.nil? || projects.empty?

    jsons << projects
  end
  jsons
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
# Builds the JSON endpoint of a project page: everything before the first "?"
# in the URL, followed by ".json".
#
# @param project_url [String] project page URL, with or without a query string
# @return [String]
def get_project_page_api(project_url)
  without_query = project_url.split("?")[0]
  without_query + ".json"
end
|
61
|
+
|
62
|
+
# Builds the search API URL for one result page of a category and state.
#
# @param page [Integer] page number
# @param status_code [String] Kickstarter state, e.g. "live"
# @param category_id [Integer] Kickstarter category id
# @return [String]
def get_projects_page_api(page = 1, status_code = "live", category_id = 0)
  search_url = "https://www.kickstarter.com/projects/search.json?page=#{page}&state=#{status_code}&category_id=#{category_id}"
  search_url
end
|
65
|
+
|
66
|
+
# Builds the search API URL for a search term.
#
# NOTE(review): name is interpolated as-is, without URL-escaping; confirm
# callers pass a term that is already safe to put in a query string.
#
# @param name [String] search term
# @return [String]
def get_project_search_doc_api(name)
  search_url = "https://www.kickstarter.com/projects/search.json?term=#{name}"
  search_url
end
|
69
|
+
|
70
|
+
# Translates the gem's status name into the Kickstarter state string.
# Anything other than "finished" (including "online") maps to "live".
#
# @param status [String]
# @return [String] "live" or "successful"
def get_status_code(status)
  states = { "online" => "live", "finished" => "successful" }
  states.fetch(status, "live")
end
|
80
|
+
|
81
|
+
# Declares the Kickstarter field extractors through the MethodBuilder DSL
# (MethodBuilder and get_string are defined elsewhere).
MethodBuilder.set_methods do
  insert_parser "Kickstarter"

  # Value shared by the extractor blocks below.
  set_variable do
    @platform_url = "https://www.kickstarter.com"
  end

  # Project title text.
  set_method :get_title do |doc|
    get_string(doc.css(".NS_projects__header h2 .green-dark"))
  end

  # Text of the second header link, used as the category.
  set_method :get_category do |doc|
    get_string(doc.css(".container-flex .h5 a.grey-dark:nth-child(2) b"))
  end

  # Creator display name.
  set_method :get_creator_name do |doc|
    get_string(doc.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog"))
  end

  # Third-from-last path segment of the creator link's href.
  set_method :get_creator_id do |doc|
    creator_path = doc.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog").first["href"]
    creator_path.split("/")[-3]
  end

  # Platform URL followed by the creator link's href.
  set_method :get_creator_link do |doc|
    @platform_url + doc.css(".NS_projects__creator .col-8>h5 a.remote_modal_dialog").first["href"]
  end

  # Short project description.
  set_method :get_summary do |doc|
    get_string(doc.css(".container-flex .col-8 .mobile-hide p.h3.mb3"))
  end

  # "datetime" attribute of the deadline <time> element, or nil when absent.
  set_method :get_end_date do |doc|
    doc.css(".NS_projects__deadline_copy p.grey-dark time[datetime]").try(:first).try(:[], "datetime")
  end

  # Text of the first header link, used as the region.
  set_method :get_region do |doc|
    get_string(doc.css(".container-flex .h5 a.grey-dark:nth-child(1) b"))
  end

  # "data-pledged" attribute of the first element that carries it.
  set_method :get_money_pledged do |doc|
    doc.css("div[data-pledged]").first["data-pledged"]
  end

  # "data-goal" attribute of the first element that carries "data-pledged".
  set_method :get_money_goal do |doc|
    doc.css("div[data-pledged]").first["data-goal"]
  end

  # "data-backers-count" attribute of the first element that carries it.
  set_method :get_backer_count do |doc|
    doc.css("div[data-backers-count]").first["data-backers-count"]
  end

  # Remaining time derived from the "data-end_time" attribute (falls back to
  # the current time when the attribute is missing).
  #
  # Returns "已結束" only when the end time has actually passed. With less than
  # a day left the remaining whole hours are reported as "<n>小時"; previously
  # the integer day count was 0 in that case and a still-running project was
  # reported as finished. Otherwise the remaining whole days are "<n>天".
  set_method :get_left_time do |doc|
    end_date = doc.css("div[data-end_time]").try(:first).try(:[], "data-end_time") || Time.now.to_s
    remaining_seconds = (Time.parse(end_date) - Time.now).to_i
    if remaining_seconds <= 0
      "已結束"
    elsif remaining_seconds < 86_400
      (remaining_seconds / 3600).to_s + "小時"
    else
      (remaining_seconds / 86_400).to_s + "天"
    end
  end

  # "finished" when get_left_time reported the end marker, "online" otherwise.
  set_method :get_status do |left_time|
    if left_time == "已結束"
      "finished"
    else
      "online"
    end
  end

  # "data-currency" attribute of the first <data> element that carries it.
  set_method :get_currency_string do |doc|
    doc.css("data[data-currency]")[0]["data-currency"]
  end
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
require "json"
|
2
|
+
require "iconv"
|
3
|
+
|
4
|
+
module CrowdFundingParser
|
5
|
+
module Parser
|
6
|
+
class Taobao < General
|
7
|
+
# Stores the parse mode and the project-page URL prefix; get_project_links
# appends a project id to @url.
def initialize
  @parse_method = :json
  @url = "http://hi.taobao.com/market/hi/detail2014.php?id="
end
|
11
|
+
|
12
|
+
# Returns the project-page URL of every project listed for the given status.
#
# Each link is @url followed by the project's "id". This is plain string
# concatenation, so it is done with a simple map. The original appended to a
# shared array from worker threads, which made the order of the result
# unpredictable; links now follow the listing order.
#
# @param status [String] "online", "preparing" or "finished" (see get_status_code)
# @return [Array<String>] project URLs
def get_project_links(status = "online")
  status_code = get_status_code(status)
  get_total_jsons(status_code).map { |json| @url + json["id"].to_s }
end
|
24
|
+
|
25
|
+
# Translates the gem's status name into the numeric status used in the
# listing URL. Unknown names map to 1, the same code as "online".
#
# @param status [String]
# @return [Integer] 1, 2 or 3
def get_status_code(status)
  codes = { "online" => 1, "preparing" => 3, "finished" => 2 }
  codes.fetch(status, 1)
end
|
37
|
+
|
38
|
+
# Downloads every listing page for the given status and returns the
# concatenated "data" entries of all pages.
#
# A page without a "data" value contributes nothing; previously `jsons += nil`
# raised a TypeError. flat_map also avoids re-copying the accumulated array
# for every page.
#
# @param status [Integer] numeric status code (see get_status_code)
# @return [Array] project entries from all pages, in page order
def get_total_jsons(status = 1)
  get_total_json_apis(status).flat_map do |url|
    page_json = get_json_through_url(url)
    (page_json && page_json["data"]) || []
  end
end
|
48
|
+
|
49
|
+
# Builds the listing API URL for every page from 1 up to the page count
# reported by get_total_page.
#
# @param status [Integer] numeric status code (see get_status_code)
# @return [Array<String>] one URL per page; empty when the count converts to 0
def get_total_json_apis(status = 1)
  last_page = get_total_page(status).to_i
  (1..last_page).map { |page| get_projects_page_api(page, status) }
end
|
57
|
+
|
58
|
+
# Reads the total number of listing pages for the given status from the
# first listing page.
#
# The first-page URL comes from get_projects_page_api(1, status), which
# yields the same URL the original spelled out a second time here, so the
# listing URL format lives in one place.
#
# @param status [Integer] numeric status code (see get_status_code)
# @return the "pageTotal" value of the first page's JSON
def get_total_page(status = 1)
  first_page = get_json_through_url(get_projects_page_api(1, status))
  first_page["pageTotal"]
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
# Builds the detail API URL for a single project.
#
# @param project_id [String, Integer] project id
# @return [String]
def get_project_api(project_id)
  detail_url = "http://hstar-hi.alicdn.com/dream/ajax/getProjectForDetail.htm?id=#{project_id}"
  detail_url
end
|
69
|
+
|
70
|
+
# Builds the listing API URL for one page of projects with the given status.
#
# @param page [Integer] page number
# @param status [Integer] numeric status code (see get_status_code)
# @return [String]
def get_projects_page_api(page = 1, status = 1)
  listing_url = "http://hstar-hi.alicdn.com/dream/ajax/getProjectList.htm?page=#{page}&pageSize=20&projectType=&type=6&status=#{status}"
  listing_url
end
|
73
|
+
|
74
|
+
# Returns the part of the project URL that follows the last "id=" marker.
#
# @param project_url [String] URL of a project page
# @return [String, nil] the whole input when the marker is absent; nil for ""
def get_id(project_url)
  pieces = project_url.split("id=")
  pieces[-1]
end
|
77
|
+
|
78
|
+
# Declares the Taobao field extractors through the MethodBuilder DSL
# (MethodBuilder is defined elsewhere). Extractor blocks read fields from
# result["data"].
MethodBuilder.set_methods do
  insert_parser "Taobao"

  # Project name.
  set_method :get_title do |result|
    result["data"]["name"]
  end

  # Creator name, passed through encode_gbk_to_utf.
  set_method :get_creator_name do |result|
    raw_creator_name = result["data"]["person"]["name"]
    @parser.encode_gbk_to_utf(raw_creator_name)
  end

  # Project description, passed through encode_gbk_to_utf.
  set_method :get_summary do |result|
    raw_summary = result["data"]["desc"]
    @parser.encode_gbk_to_utf(raw_summary)
  end

  # "begin_date" field.
  set_method :get_start_date do |result|
    result["data"]["begin_date"]
  end

  # "end_date" field.
  set_method :get_end_date do |result|
    result["data"]["end_date"]
  end

  # Region is a fixed value for this platform.
  set_method :get_region do |result|
    "China"
  end

  # "target_money" field.
  set_method :get_money_goal do |result|
    result["data"]["target_money"]
  end

  # "curr_money" field.
  set_method :get_money_pledged do |result|
    result["data"]["curr_money"]
  end

  # "support_person" field.
  set_method :get_backer_count do |result|
    result["data"]["support_person"]
  end

  # "remain_day" field.
  set_method :get_left_time do |result|
    result["data"]["remain_day"]
  end

  # "finished" when no days remain, "online" otherwise. The value is compared
  # as a string so that both 0 and "0" count as finished; comparing the raw
  # value with "0" never matched an Integer.
  set_method :get_status do |left_time|
    if left_time.to_s == "0"
      "finished"
    else
      "online"
    end
  end

  # "focus_count" field.
  set_method :get_following_count do |result|
    result["data"]["focus_count"]
  end

  # Currency code is a fixed value for this platform.
  set_method :get_currency_string do |result|
    "cny"
  end

  # Converts a gb2312 string to utf-8 with Iconv, ignoring characters that
  # cannot be converted. On failure the error is printed and the input is
  # returned unchanged. Only StandardError is rescued so that interrupts and
  # exits are not swallowed.
  set_method :encode_gbk_to_utf do |string|
    begin
      Iconv.conv("utf-8//ignore", "gb2312//ignore", string)
    rescue StandardError => e
      puts e
      string
    end
  end
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|