fbcrawl-colly 0.2.4 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0fc0e07942d352bb9b49c93a106d138bc9946f8e1943bcd81f6181082e79c413
4
- data.tar.gz: 1e31f7fe0bc3bf83c82b90d84080a257734cfe63eaa4d79fdb89aa52204e2eb4
3
+ metadata.gz: f420f1cc6b4b6260b8106422e80c0488a351fb21fdcaa0ad1af56b0402beb8da
4
+ data.tar.gz: 86193bbf6bfab5b48f73ad4f2594f5e4ff0761901ec5653af90ea638cbf9def6
5
5
  SHA512:
6
- metadata.gz: 55d00db7f51b078c1ca7c46a59b06eb4d62b1859fac6e11a2da3c117f7c2dd3958d442559f9b84573f4d4f5e2d02a77539287d5628ff60f421405a3e2910e2b7
7
- data.tar.gz: 550b26405d7bbd13356f1ca7ef30edfa682c04364f2f8fc409269c74fe25e8cb0535464d395083109b04e17de129bd851da3f035dd2dec6af2095ed357a1ddd7
6
+ metadata.gz: c6396d3572f0beaedaf198388e688e4f3bed09e6920c260cfddf61c36267c97e2a6babea3d297e07d1b7a816f6b416ec7c6d1a27775d85bea23dc4b5b07d9a51
7
+ data.tar.gz: a94d9ed5fc604e6e001abfb7f93ac9a82a075b686ada0214e854761ea0aa84712651b6605a7f885dcadc45be1dcad7d8bf2ac580f0e61dd2d3b086b02849c440
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.4)
5
- ffi
4
+ fbcrawl-colly (1.1.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
11
  google-protobuf (3.13.0)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.31.1)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project crawls mbasic.facebook.com using Go Colly.
4
4
 
5
- Ruby gem will available to use
5
+ The Ruby gem is only a client for the gRPC Colly service.
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,9 +1,80 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Logs in with email, password and TOTP secret; the response carries the cookies.
8
+ rpc Login (LoginRequest) returns (LoginResponse) {}
9
+ rpc FetchMyGroups (FetchMyGroupsRequest) returns (FacebookGroupList) {}
10
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
11
+ rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
12
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
13
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
14
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
15
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
16
+ }
17
+
18
+ message Context {
19
+ string cookies = 1;
20
+ }
21
+
22
+ message LoginRequest {
23
+ string email = 2;
24
+ string password = 3;
25
+ string totp_secret = 4;
26
+ }
27
+
28
+ message LoginResponse {
29
+ string cookies = 1;
30
+ }
31
+
32
+ message LoginWithCookiesRequest {
33
+ string cookies = 1;
34
+ }
35
+
36
+ message FetchMyGroupsRequest {
37
+ Context context = 1;
38
+ }
39
+
40
+ message FetchGroupInfoRequest {
41
+ Context context = 1;
42
+ string group_username = 2;
43
+ }
44
+
45
+ message FetchUserInfoRequest {
46
+ Context context = 1;
47
+ string username = 2;
48
+ }
49
+
50
+ message FetchGroupFeedRequest {
51
+ Context context = 1;
52
+ int64 group_id = 2;
53
+ string next_cursor = 3;
54
+ }
55
+
56
+ message FetchPostRequest {
57
+ Context context = 1;
58
+ int64 group_id = 2;
59
+ int64 post_id = 3;
60
+ string comment_next_cursor = 4;
61
+ }
62
+
63
+ message FetchContentImagesRequest {
64
+ Context context = 1;
65
+ int64 post_id = 2;
66
+ string next_cursor = 3;
67
+ }
68
+
69
+ message FetchImageUrlRequest {
70
+ Context context = 1;
71
+ int64 image_id = 2;
72
+ }
73
+
74
+ message FacebookGroupList {
75
+ repeated FacebookGroup groups = 1;
76
+ }
5
77
 
6
- // The request message containing the user's name.
7
78
  message FacebookGroup {
8
79
  int64 id = 1;
9
80
  string name = 2;
@@ -13,6 +84,8 @@ message FacebookGroup {
13
84
  message FacebookUser {
14
85
  int64 id = 1;
15
86
  string name = 2;
87
+ string username = 3;
88
+ int64 friend_count = 4;
16
89
  }
17
90
 
18
91
  message FacebookPost {
@@ -20,15 +93,20 @@ message FacebookPost {
20
93
  FacebookGroup group = 2;
21
94
  FacebookUser user = 3;
22
95
  string content = 4;
96
+ CommentList comments = 5;
23
97
  string content_link = 6;
24
- FacebookImage content_image = 8;
25
98
  repeated FacebookImage content_images = 7;
26
- repeated FacebookComment comments = 5;
99
+ FacebookImage content_image = 8;
27
100
  int64 created_at = 9;
28
101
  int64 reaction_count = 10;
29
102
  int64 comment_count = 11;
30
103
  }
31
104
 
105
+ message CommentList {
106
+ repeated FacebookComment comments = 5;
107
+ string next_cursor = 12;
108
+ }
109
+
32
110
  message FacebookImage {
33
111
  int64 id = 1;
34
112
  string url = 2;
@@ -44,8 +122,10 @@ message FacebookComment {
44
122
 
45
123
  message FacebookPostList {
46
124
  repeated FacebookPost posts = 1;
125
+ string next_cursor = 2;
47
126
  }
48
127
 
49
128
  message FacebookImageList {
50
129
  repeated FacebookImage images = 1;
130
+ string next_cursor = 2;
51
131
  }
@@ -6,16 +6,17 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/extensions"
11
+ "github.com/gocolly/colly/v2/storage"
12
12
  "github.com/google/logger"
13
13
  "github.com/olebedev/when"
14
14
  "github.com/olebedev/when/rules/common"
15
15
  "github.com/olebedev/when/rules/en"
16
16
  "github.com/thoas/go-funk"
17
+ "github.com/xlzd/gotp"
17
18
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
19
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
20
  "regexp"
20
21
  "strconv"
21
22
  "strings"
@@ -33,7 +34,7 @@ type FbDataInsight struct {
33
34
  FbDataPostContext `json:"post_context"`
34
35
  }
35
36
  type FbDataFt struct {
36
- ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
37
38
  PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
39
  PhotoId int64 `json:"photo_id,string"`
39
40
  PageId int64 `json:"page_id,string"`
@@ -41,30 +42,38 @@ type FbDataFt struct {
41
42
  PageInsights map[string]FbDataInsight `json:"page_insights"`
42
43
  }
43
44
 
44
- func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
46
- //request.Headers.Set("Host", "facebook.com")
47
- request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
- request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
49
- request.Headers.Set("origin", "https://mbasic.facebook.com")
50
-
51
- //logger.Info("Saved referrer is", request.Ctx.Get("_referer"))
52
- request.Headers.Set("referer", "https://mbasic.facebook.com/checkpoint/?_rdr")
53
- request.Headers.Set("cache-control", "max-age=0")
54
- request.Headers.Set("upgrade-insecure-requests", "1")
55
- //accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
56
- //origin: https://mbasic.facebook.com
57
- //referer: https://mbasic.facebook.com/checkpoint/?_rdr
58
- request.Headers.Set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36")
59
- request.ResponseCharacterEncoding = "utf-8"
60
- }
61
-
62
45
  func setupSharedCollector(collector *colly.Collector) error {
63
46
  var err error
64
47
  extensions.Referer(collector)
65
48
  collector.AllowURLRevisit = true
66
- collector.OnRequest(sharedOnRequest)
67
- collector.OnResponse(sharedOnResponse)
49
+ var lastUrl string
50
+ collector.OnRequest(func(request *colly.Request) {
51
+ lastUrl = request.URL.RawPath
52
+ logger.Info("OnRequest ", request.URL)
53
+ //request.Headers.Set("Host", "facebook.com")
54
+ request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
55
+ request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
56
+ request.Headers.Set("origin", "https://mbasic.facebook.com")
57
+
58
+ //logger.Info("Saved referrer is", request.Ctx.Get("_referer"))
59
+ request.Headers.Set("referer", "https://mbasic.facebook.com/checkpoint/?_rdr")
60
+ request.Headers.Set("cache-control", "max-age=0")
61
+ request.Headers.Set("upgrade-insecure-requests", "1")
62
+ //accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
63
+ //origin: https://mbasic.facebook.com
64
+ //referer: https://mbasic.facebook.com/checkpoint/?_rdr
65
+ request.Headers.Set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36")
66
+ request.ResponseCharacterEncoding = "utf-8"
67
+ })
68
+ collector.OnResponse(func(response *colly.Response) {
69
+ logger.Info("OnResponse ./last.html")
70
+ _ = response.Save("./last.html")
71
+ //logger.Info(string(resp.Body))
72
+ })
73
+
74
+ collector.OnHTML("a[href*=\"177066345680802\"", func(element *colly.HTMLElement) {
75
+ logger.Error("RateLimit reached ", lastUrl)
76
+ })
68
77
  collector.OnError(func(resp *colly.Response, errHttp error) {
69
78
  err = errHttp
70
79
  logger.Error("OnError", err)
@@ -72,12 +81,6 @@ func setupSharedCollector(collector *colly.Collector) error {
72
81
  return err
73
82
  }
74
83
 
75
- func sharedOnResponse(response *colly.Response) {
76
- logger.Info("OnResponse ./last.html")
77
- _ = response.Save("./last.html")
78
- //logger.Info(string(resp.Body))
79
- }
80
-
81
84
  func getForm(element *colly.HTMLElement, err error) (string, error, map[string]string) {
82
85
  submitUrl, exists := element.DOM.Attr("action")
83
86
  if !exists {
@@ -106,7 +109,7 @@ func New() *Fbcolly {
106
109
  return &f
107
110
  }
108
111
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
113
  collector := f.collector.Clone()
111
114
  err := setupSharedCollector(collector)
112
115
 
@@ -155,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
155
158
  //logger.Info("Please input OTP")
156
159
  //reader := bufio.NewReader(os.Stdin)
157
160
  //code, _ := reader.ReadString('\n')
158
- code := otp[0:6]
159
- reqMap["approvals_code"] = code
160
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
161
167
  } else {
162
168
  logger.Info("OnHTML Only Continue checkpoint")
163
169
 
@@ -191,22 +197,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
191
197
 
192
198
  }
193
199
 
194
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
195
201
  collector := f.collector.Clone()
196
202
  err := setupSharedCollector(collector)
197
- currentPage := 1
198
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
199
204
 
200
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
201
- currentPage++
202
- if currentPage < 3 {
203
- logger.Info("Will fetch page", currentPage)
204
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
205
- }
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
206
207
  })
207
- collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
208
+ collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
208
209
  dataElement := element
209
- post := &fbcrawl.FacebookPost{}
210
+ post := &pb.FacebookPost{}
210
211
  var fbDataFt FbDataFt
211
212
  jsonData := dataElement.Attr("data-ft")
212
213
 
@@ -218,9 +219,10 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
218
219
  }
219
220
  logger.Info("Post ", fbDataFt)
220
221
  post.Id = fbDataFt.TopLevelPostId
221
- post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
222
- post.User = &fbcrawl.FacebookUser{
223
- Id: fbDataFt.ContentOwnerIdNew,
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
224
+ post.User = &pb.FacebookUser{
225
+ Id: userId,
224
226
  Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
225
227
  }
226
228
  post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
@@ -239,77 +241,110 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
239
241
  post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
240
242
  post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
241
243
  post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
242
- post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
244
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
243
245
  i, _ := strconv.ParseInt(id, 10, 64)
244
- return &fbcrawl.FacebookImage{
246
+ return &pb.FacebookImage{
245
247
  Id: i,
246
248
  }
247
- })).([]*fbcrawl.FacebookImage)
249
+ })).([]*pb.FacebookImage)
248
250
 
249
251
  if fbDataFt.PhotoId > 0 {
250
- post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
252
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
251
253
  }
252
- result = append(result, post)
254
+ result.Posts = append(result.Posts, post)
255
+ })
256
+ if len(nextCursor) > 0 {
257
+ err = collector.Visit(nextCursor)
258
+ } else {
259
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
260
+ }
261
+
262
+ if err != nil {
263
+ logger.Error("crawl by colly err:", err)
264
+ }
265
+ return err, &result
266
+ }
267
+
268
+ func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
269
+ collector := f.collector.Clone()
270
+ err := setupSharedCollector(collector)
271
+
272
+ result := &pb.FacebookUser{}
273
+
274
+ collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
275
+ parsed, _ := url.Parse(element.Attr("href"))
276
+ result.Username = strings.Split(parsed.Path[1:], "/")[0]
277
+ result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
278
+ })
279
+
280
+ collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
281
+ result.FriendCount = getNumberFromText(element.Text)
282
+ })
283
+
284
+ collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
285
+ result.Name = element.DOM.Find("strong").First().Text()
253
286
  })
254
287
 
255
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
288
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
256
289
  if err != nil {
257
290
  logger.Error("crawl by colly err:", err)
258
291
  }
259
- return err, &fbcrawl.FacebookPostList{Posts: result}
292
+ return err, result
260
293
  }
261
294
 
262
- func (f *Fbcolly) FetchGroupInfo(groupId int64) (error, *fbcrawl.FacebookGroup) {
295
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
263
296
  collector := f.collector.Clone()
264
297
  err := setupSharedCollector(collector)
265
- result := &fbcrawl.FacebookGroup{Id: groupId}
298
+ result := &pb.FacebookGroup{}
266
299
 
267
300
  collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
268
301
  result.Name = element.Text
269
302
  })
270
303
  collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
271
- result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(),10,64)
304
+ result.Id = getNumberFromText(element.Attr("href"))
305
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
272
306
  })
273
307
 
274
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d?view=info", groupId))
308
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
275
309
  if err != nil {
276
310
  logger.Error("crawl by colly err:", err)
277
311
  }
278
312
  return err, result
279
313
  }
280
314
 
281
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
315
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
282
316
  collector := f.collector.Clone()
283
317
  err := setupSharedCollector(collector)
284
- currentPage := 1
285
- var result []*fbcrawl.FacebookImage
318
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
286
319
 
287
320
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
288
- currentPage++
289
- logger.Info("Will fetch page", currentPage)
290
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
321
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
291
322
  })
292
323
 
293
324
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
294
- result = append(result, &fbcrawl.FacebookImage{
325
+ result.Images = append(result.Images, &pb.FacebookImage{
295
326
  Id: getImageIdFromHref(element.Attr("href")),
296
327
  })
297
328
  //f.detailCollector.Visit(url)
298
329
  })
330
+ if len(nextCursor) > 0 {
331
+ err = collector.Visit(nextCursor)
332
+ } else {
333
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
334
+ }
299
335
 
300
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
301
336
  if err != nil {
302
337
  logger.Error("crawl by colly err:", err)
303
338
  }
304
- return err, &fbcrawl.FacebookImageList{Images: result}
339
+ return err, &result
305
340
  }
306
341
 
307
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
342
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
308
343
  collector := f.collector.Clone()
309
344
  err := setupSharedCollector(collector)
310
- result := fbcrawl.FacebookImage{Id: imageId}
345
+ result := pb.FacebookImage{Id: imageId}
311
346
 
312
- collector.OnHTML("a", func(element *colly.HTMLElement) {
347
+ collector.OnHTML("a[href*=\"fbcdn\"]", func(element *colly.HTMLElement) {
313
348
  result.Url = element.Attr("href")
314
349
  })
315
350
 
@@ -320,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
320
355
  return err, &result
321
356
  }
322
357
 
323
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
358
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
324
359
  collector := f.collector.Clone()
325
360
  err := setupSharedCollector(collector)
326
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
327
- commentPaging := 0
361
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
328
362
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
329
363
  dataElement := element.DOM.Find("div[data-ft]")
330
364
  if dataElement.Length() > 0 {
@@ -339,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
339
373
  }
340
374
  logger.Info("Post ", result)
341
375
  post.Id = result.TopLevelPostId
342
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
343
- post.User = &fbcrawl.FacebookUser{
344
- Id: result.ContentOwnerIdNew,
376
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
377
+ userId, _ := result.ContentOwnerIdNew.Int64()
378
+ post.User = &pb.FacebookUser{
379
+ Id: userId,
345
380
  Name: dataElement.Find("h3 strong:first-child a").Text(),
346
381
  }
347
382
  post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
@@ -359,15 +394,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
359
394
 
360
395
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
361
396
  post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
362
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
397
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
363
398
  i, _ := strconv.ParseInt(id, 10, 64)
364
- return &fbcrawl.FacebookImage{
399
+ return &pb.FacebookImage{
365
400
  Id: i,
366
401
  }
367
- })).([]*fbcrawl.FacebookImage)
402
+ })).([]*pb.FacebookImage)
368
403
 
369
404
  if result.PhotoId > 0 {
370
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
405
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
371
406
  }
372
407
  }
373
408
 
@@ -375,32 +410,48 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
375
410
  element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
376
411
  //author
377
412
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
378
- logger.Info("comment", commentId)
379
- createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
380
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
381
- Id: commentId,
382
- Post: &fbcrawl.FacebookPost{Id: post.Id},
383
- User: &fbcrawl.FacebookUser{
384
- Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
385
- Name: selection.Find("h3 > a").Text(),
386
- },
387
- Content: selection.Find("h3 + div").Text(),
388
- CreatedAt: createdAtWhenResult.Time.Unix(),
389
- })
413
+ if commentId > 0 {
414
+ createdAtWhenResult, err := f.w.Parse(selection.Find("abbr").Text(), time.Now())
415
+ if err != nil {
416
+ logger.Error(err)
417
+ return
418
+ }
419
+ parsed, err := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
420
+ if err != nil {
421
+ logger.Error(err)
422
+ return
423
+ }
424
+ if len(parsed.Path) == 0 {
425
+ logger.Error("Empty path for commentId ", commentId)
426
+ return
427
+ }
428
+ if len(parsed.Path) > 1 {
429
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
430
+ Id: commentId,
431
+ Post: &pb.FacebookPost{Id: post.Id},
432
+ User: &pb.FacebookUser{
433
+ Username: parsed.Path[1:],
434
+ Name: selection.Find("h3 > a").Text(),
435
+ },
436
+ Content: selection.Find("h3 + div").Text(),
437
+ CreatedAt: createdAtWhenResult.Time.Unix(),
438
+ })
439
+ }
440
+ }
390
441
  })
391
442
 
392
443
  }
393
444
  })
394
445
 
395
446
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
396
- if commentPaging < 3 {
397
- logger.Info("Comment paging", commentPaging)
398
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
399
- commentPaging = commentPaging + 1
400
- }
447
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
401
448
  })
449
+ if len(commentNextCursor) > 0 {
450
+ err = collector.Visit(commentNextCursor)
451
+ } else {
452
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
453
+ }
402
454
 
403
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
404
455
  return err, post
405
456
  }
406
457
 
@@ -409,13 +460,36 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
409
460
  return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
410
461
  }
411
462
 
463
+ func (f *Fbcolly) FetchMyGroups() (error, *pb.FacebookGroupList) {
464
+ collector := f.collector.Clone()
465
+ err := setupSharedCollector(collector)
466
+ result := &pb.FacebookGroupList{Groups: []*pb.FacebookGroup{}}
467
+
468
+ collector.OnHTML("li table a", func(element *colly.HTMLElement) {
469
+ result.Groups = append(result.Groups, &pb.FacebookGroup{
470
+ Id: getNumberFromText(element.Attr("href")),
471
+ Name: element.Text,
472
+ })
473
+ })
474
+
475
+ err = collector.Visit("https://mbasic.facebook.com/groups/?seemore")
476
+ if err != nil {
477
+ logger.Error("crawl by colly err:", err)
478
+ }
479
+ return err, result
480
+ }
481
+
412
482
  //func getUsernameFromHref(href string) string {
413
483
  // return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
414
484
  //}
415
485
 
416
486
  func getUserIdFromCommentHref(href string) int64 {
417
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
418
- return id
487
+ match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
488
+ if len(match) > 0 {
489
+ id, _ := strconv.ParseInt(match[1], 10, 64)
490
+ return id
491
+ }
492
+ return 0
419
493
  }
420
494
 
421
495
  func getUrlFromRedirectHref(href string) string {
@@ -430,17 +504,19 @@ func getImageIdFromHref(href string) int64 {
430
504
  }
431
505
 
432
506
  func getNumberFromText(text string) int64 {
433
- logger.Error("reaction", text)
507
+ logger.Info("getNumberFromText ", text)
434
508
  if len(text) > 0 {
435
- match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
436
- count, _ := strconv.ParseInt(match[1], 10, 64)
437
- switch match[2] {
438
- case "k":
439
- count *= 1000
440
- case "m":
441
- count *= 1000000
509
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
510
+ if len(match) > 0 {
511
+ count, _ := strconv.ParseInt(match[1], 10, 64)
512
+ switch match[2] {
513
+ case "k":
514
+ count *= 1000
515
+ case "m":
516
+ count *= 1000000
517
+ }
518
+ return count
442
519
  }
443
- return count
444
520
  }
445
521
  return 0
446
522
  }