fbcrawl-colly 0.2.4 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0fc0e07942d352bb9b49c93a106d138bc9946f8e1943bcd81f6181082e79c413
4
- data.tar.gz: 1e31f7fe0bc3bf83c82b90d84080a257734cfe63eaa4d79fdb89aa52204e2eb4
3
+ metadata.gz: f420f1cc6b4b6260b8106422e80c0488a351fb21fdcaa0ad1af56b0402beb8da
4
+ data.tar.gz: 86193bbf6bfab5b48f73ad4f2594f5e4ff0761901ec5653af90ea638cbf9def6
5
5
  SHA512:
6
- metadata.gz: 55d00db7f51b078c1ca7c46a59b06eb4d62b1859fac6e11a2da3c117f7c2dd3958d442559f9b84573f4d4f5e2d02a77539287d5628ff60f421405a3e2910e2b7
7
- data.tar.gz: 550b26405d7bbd13356f1ca7ef30edfa682c04364f2f8fc409269c74fe25e8cb0535464d395083109b04e17de129bd851da3f035dd2dec6af2095ed357a1ddd7
6
+ metadata.gz: c6396d3572f0beaedaf198388e688e4f3bed09e6920c260cfddf61c36267c97e2a6babea3d297e07d1b7a816f6b416ec7c6d1a27775d85bea23dc4b5b07d9a51
7
+ data.tar.gz: a94d9ed5fc604e6e001abfb7f93ac9a82a075b686ada0214e854761ea0aa84712651b6605a7f885dcadc45be1dcad7d8bf2ac580f0e61dd2d3b086b02849c440
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.4)
5
- ffi
4
+ fbcrawl-colly (1.1.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
11
  google-protobuf (3.13.0)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.31.1)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ Ruby gem is only client for GRPC colly service
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,9 +1,80 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Sends a greeting
8
+ rpc Login (LoginRequest) returns (LoginResponse) {}
9
+ rpc FetchMyGroups (FetchMyGroupsRequest) returns (FacebookGroupList) {}
10
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
11
+ rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
12
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
13
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
14
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
15
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
16
+ }
17
+
18
+ message Context {
19
+ string cookies = 1;
20
+ }
21
+
22
+ message LoginRequest {
23
+ string email = 2;
24
+ string password = 3;
25
+ string totp_secret = 4;
26
+ }
27
+
28
+ message LoginResponse {
29
+ string cookies = 1;
30
+ }
31
+
32
+ message LoginWithCookiesRequest {
33
+ string cookies = 1;
34
+ }
35
+
36
+ message FetchMyGroupsRequest {
37
+ Context context = 1;
38
+ }
39
+
40
+ message FetchGroupInfoRequest {
41
+ Context context = 1;
42
+ string group_username = 2;
43
+ }
44
+
45
+ message FetchUserInfoRequest {
46
+ Context context = 1;
47
+ string username = 2;
48
+ }
49
+
50
+ message FetchGroupFeedRequest {
51
+ Context context = 1;
52
+ int64 group_id = 2;
53
+ string next_cursor = 3;
54
+ }
55
+
56
+ message FetchPostRequest {
57
+ Context context = 1;
58
+ int64 group_id = 2;
59
+ int64 post_id = 3;
60
+ string comment_next_cursor = 4;
61
+ }
62
+
63
+ message FetchContentImagesRequest {
64
+ Context context = 1;
65
+ int64 post_id = 2;
66
+ string next_cursor = 3;
67
+ }
68
+
69
+ message FetchImageUrlRequest {
70
+ Context context = 1;
71
+ int64 image_id = 2;
72
+ }
73
+
74
+ message FacebookGroupList {
75
+ repeated FacebookGroup groups = 1;
76
+ }
5
77
 
6
- // The request message containing the user's name.
7
78
  message FacebookGroup {
8
79
  int64 id = 1;
9
80
  string name = 2;
@@ -13,6 +84,8 @@ message FacebookGroup {
13
84
  message FacebookUser {
14
85
  int64 id = 1;
15
86
  string name = 2;
87
+ string username = 3;
88
+ int64 friend_count = 4;
16
89
  }
17
90
 
18
91
  message FacebookPost {
@@ -20,15 +93,20 @@ message FacebookPost {
20
93
  FacebookGroup group = 2;
21
94
  FacebookUser user = 3;
22
95
  string content = 4;
96
+ CommentList comments = 5;
23
97
  string content_link = 6;
24
- FacebookImage content_image = 8;
25
98
  repeated FacebookImage content_images = 7;
26
- repeated FacebookComment comments = 5;
99
+ FacebookImage content_image = 8;
27
100
  int64 created_at = 9;
28
101
  int64 reaction_count = 10;
29
102
  int64 comment_count = 11;
30
103
  }
31
104
 
105
+ message CommentList {
106
+ repeated FacebookComment comments = 5;
107
+ string next_cursor = 12;
108
+ }
109
+
32
110
  message FacebookImage {
33
111
  int64 id = 1;
34
112
  string url = 2;
@@ -44,8 +122,10 @@ message FacebookComment {
44
122
 
45
123
  message FacebookPostList {
46
124
  repeated FacebookPost posts = 1;
125
+ string next_cursor = 2;
47
126
  }
48
127
 
49
128
  message FacebookImageList {
50
129
  repeated FacebookImage images = 1;
130
+ string next_cursor = 2;
51
131
  }
@@ -6,16 +6,17 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/extensions"
11
+ "github.com/gocolly/colly/v2/storage"
12
12
  "github.com/google/logger"
13
13
  "github.com/olebedev/when"
14
14
  "github.com/olebedev/when/rules/common"
15
15
  "github.com/olebedev/when/rules/en"
16
16
  "github.com/thoas/go-funk"
17
+ "github.com/xlzd/gotp"
17
18
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
19
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
20
  "regexp"
20
21
  "strconv"
21
22
  "strings"
@@ -33,7 +34,7 @@ type FbDataInsight struct {
33
34
  FbDataPostContext `json:"post_context"`
34
35
  }
35
36
  type FbDataFt struct {
36
- ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
37
38
  PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
39
  PhotoId int64 `json:"photo_id,string"`
39
40
  PageId int64 `json:"page_id,string"`
@@ -41,30 +42,38 @@ type FbDataFt struct {
41
42
  PageInsights map[string]FbDataInsight `json:"page_insights"`
42
43
  }
43
44
 
44
- func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
46
- //request.Headers.Set("Host", "facebook.com")
47
- request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
- request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
49
- request.Headers.Set("origin", "https://mbasic.facebook.com")
50
-
51
- //logger.Info("Saved referrer is", request.Ctx.Get("_referer"))
52
- request.Headers.Set("referer", "https://mbasic.facebook.com/checkpoint/?_rdr")
53
- request.Headers.Set("cache-control", "max-age=0")
54
- request.Headers.Set("upgrade-insecure-requests", "1")
55
- //accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
56
- //origin: https://mbasic.facebook.com
57
- //referer: https://mbasic.facebook.com/checkpoint/?_rdr
58
- request.Headers.Set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36")
59
- request.ResponseCharacterEncoding = "utf-8"
60
- }
61
-
62
45
  func setupSharedCollector(collector *colly.Collector) error {
63
46
  var err error
64
47
  extensions.Referer(collector)
65
48
  collector.AllowURLRevisit = true
66
- collector.OnRequest(sharedOnRequest)
67
- collector.OnResponse(sharedOnResponse)
49
+ var lastUrl string
50
+ collector.OnRequest(func(request *colly.Request) {
51
+ lastUrl = request.URL.RawPath
52
+ logger.Info("OnRequest ", request.URL)
53
+ //request.Headers.Set("Host", "facebook.com")
54
+ request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
55
+ request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
56
+ request.Headers.Set("origin", "https://mbasic.facebook.com")
57
+
58
+ //logger.Info("Saved referrer is", request.Ctx.Get("_referer"))
59
+ request.Headers.Set("referer", "https://mbasic.facebook.com/checkpoint/?_rdr")
60
+ request.Headers.Set("cache-control", "max-age=0")
61
+ request.Headers.Set("upgrade-insecure-requests", "1")
62
+ //accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
63
+ //origin: https://mbasic.facebook.com
64
+ //referer: https://mbasic.facebook.com/checkpoint/?_rdr
65
+ request.Headers.Set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36")
66
+ request.ResponseCharacterEncoding = "utf-8"
67
+ })
68
+ collector.OnResponse(func(response *colly.Response) {
69
+ logger.Info("OnResponse ./last.html")
70
+ _ = response.Save("./last.html")
71
+ //logger.Info(string(resp.Body))
72
+ })
73
+
74
+ collector.OnHTML("a[href*=\"177066345680802\"", func(element *colly.HTMLElement) {
75
+ logger.Error("RateLimit reached ", lastUrl)
76
+ })
68
77
  collector.OnError(func(resp *colly.Response, errHttp error) {
69
78
  err = errHttp
70
79
  logger.Error("OnError", err)
@@ -72,12 +81,6 @@ func setupSharedCollector(collector *colly.Collector) error {
72
81
  return err
73
82
  }
74
83
 
75
- func sharedOnResponse(response *colly.Response) {
76
- logger.Info("OnResponse ./last.html")
77
- _ = response.Save("./last.html")
78
- //logger.Info(string(resp.Body))
79
- }
80
-
81
84
  func getForm(element *colly.HTMLElement, err error) (string, error, map[string]string) {
82
85
  submitUrl, exists := element.DOM.Attr("action")
83
86
  if !exists {
@@ -106,7 +109,7 @@ func New() *Fbcolly {
106
109
  return &f
107
110
  }
108
111
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
113
  collector := f.collector.Clone()
111
114
  err := setupSharedCollector(collector)
112
115
 
@@ -155,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
155
158
  //logger.Info("Please input OTP")
156
159
  //reader := bufio.NewReader(os.Stdin)
157
160
  //code, _ := reader.ReadString('\n')
158
- code := otp[0:6]
159
- reqMap["approvals_code"] = code
160
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
161
167
  } else {
162
168
  logger.Info("OnHTML Only Continue checkpoint")
163
169
 
@@ -191,22 +197,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
191
197
 
192
198
  }
193
199
 
194
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
195
201
  collector := f.collector.Clone()
196
202
  err := setupSharedCollector(collector)
197
- currentPage := 1
198
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
199
204
 
200
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
201
- currentPage++
202
- if currentPage < 3 {
203
- logger.Info("Will fetch page", currentPage)
204
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
205
- }
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
206
207
  })
207
- collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
208
+ collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
208
209
  dataElement := element
209
- post := &fbcrawl.FacebookPost{}
210
+ post := &pb.FacebookPost{}
210
211
  var fbDataFt FbDataFt
211
212
  jsonData := dataElement.Attr("data-ft")
212
213
 
@@ -218,9 +219,10 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
218
219
  }
219
220
  logger.Info("Post ", fbDataFt)
220
221
  post.Id = fbDataFt.TopLevelPostId
221
- post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
222
- post.User = &fbcrawl.FacebookUser{
223
- Id: fbDataFt.ContentOwnerIdNew,
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
224
+ post.User = &pb.FacebookUser{
225
+ Id: userId,
224
226
  Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
225
227
  }
226
228
  post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
@@ -239,77 +241,110 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
239
241
  post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
240
242
  post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
241
243
  post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
242
- post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
244
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
243
245
  i, _ := strconv.ParseInt(id, 10, 64)
244
- return &fbcrawl.FacebookImage{
246
+ return &pb.FacebookImage{
245
247
  Id: i,
246
248
  }
247
- })).([]*fbcrawl.FacebookImage)
249
+ })).([]*pb.FacebookImage)
248
250
 
249
251
  if fbDataFt.PhotoId > 0 {
250
- post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
252
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
251
253
  }
252
- result = append(result, post)
254
+ result.Posts = append(result.Posts, post)
255
+ })
256
+ if len(nextCursor) > 0 {
257
+ err = collector.Visit(nextCursor)
258
+ } else {
259
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
260
+ }
261
+
262
+ if err != nil {
263
+ logger.Error("crawl by colly err:", err)
264
+ }
265
+ return err, &result
266
+ }
267
+
268
+ func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
269
+ collector := f.collector.Clone()
270
+ err := setupSharedCollector(collector)
271
+
272
+ result := &pb.FacebookUser{}
273
+
274
+ collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
275
+ parsed, _ := url.Parse(element.Attr("href"))
276
+ result.Username = strings.Split(parsed.Path[1:], "/")[0]
277
+ result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
278
+ })
279
+
280
+ collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
281
+ result.FriendCount = getNumberFromText(element.Text)
282
+ })
283
+
284
+ collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
285
+ result.Name = element.DOM.Find("strong").First().Text()
253
286
  })
254
287
 
255
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
288
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
256
289
  if err != nil {
257
290
  logger.Error("crawl by colly err:", err)
258
291
  }
259
- return err, &fbcrawl.FacebookPostList{Posts: result}
292
+ return err, result
260
293
  }
261
294
 
262
- func (f *Fbcolly) FetchGroupInfo(groupId int64) (error, *fbcrawl.FacebookGroup) {
295
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
263
296
  collector := f.collector.Clone()
264
297
  err := setupSharedCollector(collector)
265
- result := &fbcrawl.FacebookGroup{Id: groupId}
298
+ result := &pb.FacebookGroup{}
266
299
 
267
300
  collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
268
301
  result.Name = element.Text
269
302
  })
270
303
  collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
271
- result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(),10,64)
304
+ result.Id = getNumberFromText(element.Attr("href"))
305
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
272
306
  })
273
307
 
274
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d?view=info", groupId))
308
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
275
309
  if err != nil {
276
310
  logger.Error("crawl by colly err:", err)
277
311
  }
278
312
  return err, result
279
313
  }
280
314
 
281
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
315
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
282
316
  collector := f.collector.Clone()
283
317
  err := setupSharedCollector(collector)
284
- currentPage := 1
285
- var result []*fbcrawl.FacebookImage
318
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
286
319
 
287
320
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
288
- currentPage++
289
- logger.Info("Will fetch page", currentPage)
290
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
321
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
291
322
  })
292
323
 
293
324
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
294
- result = append(result, &fbcrawl.FacebookImage{
325
+ result.Images = append(result.Images, &pb.FacebookImage{
295
326
  Id: getImageIdFromHref(element.Attr("href")),
296
327
  })
297
328
  //f.detailCollector.Visit(url)
298
329
  })
330
+ if len(nextCursor) > 0 {
331
+ err = collector.Visit(nextCursor)
332
+ } else {
333
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
334
+ }
299
335
 
300
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
301
336
  if err != nil {
302
337
  logger.Error("crawl by colly err:", err)
303
338
  }
304
- return err, &fbcrawl.FacebookImageList{Images: result}
339
+ return err, &result
305
340
  }
306
341
 
307
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
342
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
308
343
  collector := f.collector.Clone()
309
344
  err := setupSharedCollector(collector)
310
- result := fbcrawl.FacebookImage{Id: imageId}
345
+ result := pb.FacebookImage{Id: imageId}
311
346
 
312
- collector.OnHTML("a", func(element *colly.HTMLElement) {
347
+ collector.OnHTML("a[href*=\"fbcdn\"]", func(element *colly.HTMLElement) {
313
348
  result.Url = element.Attr("href")
314
349
  })
315
350
 
@@ -320,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
320
355
  return err, &result
321
356
  }
322
357
 
323
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
358
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
324
359
  collector := f.collector.Clone()
325
360
  err := setupSharedCollector(collector)
326
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
327
- commentPaging := 0
361
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
328
362
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
329
363
  dataElement := element.DOM.Find("div[data-ft]")
330
364
  if dataElement.Length() > 0 {
@@ -339,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
339
373
  }
340
374
  logger.Info("Post ", result)
341
375
  post.Id = result.TopLevelPostId
342
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
343
- post.User = &fbcrawl.FacebookUser{
344
- Id: result.ContentOwnerIdNew,
376
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
377
+ userId, _ := result.ContentOwnerIdNew.Int64()
378
+ post.User = &pb.FacebookUser{
379
+ Id: userId,
345
380
  Name: dataElement.Find("h3 strong:first-child a").Text(),
346
381
  }
347
382
  post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
@@ -359,15 +394,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
359
394
 
360
395
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
361
396
  post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
362
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
397
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
363
398
  i, _ := strconv.ParseInt(id, 10, 64)
364
- return &fbcrawl.FacebookImage{
399
+ return &pb.FacebookImage{
365
400
  Id: i,
366
401
  }
367
- })).([]*fbcrawl.FacebookImage)
402
+ })).([]*pb.FacebookImage)
368
403
 
369
404
  if result.PhotoId > 0 {
370
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
405
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
371
406
  }
372
407
  }
373
408
 
@@ -375,32 +410,48 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
375
410
  element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
376
411
  //author
377
412
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
378
- logger.Info("comment", commentId)
379
- createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
380
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
381
- Id: commentId,
382
- Post: &fbcrawl.FacebookPost{Id: post.Id},
383
- User: &fbcrawl.FacebookUser{
384
- Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
385
- Name: selection.Find("h3 > a").Text(),
386
- },
387
- Content: selection.Find("h3 + div").Text(),
388
- CreatedAt: createdAtWhenResult.Time.Unix(),
389
- })
413
+ if commentId > 0 {
414
+ createdAtWhenResult, err := f.w.Parse(selection.Find("abbr").Text(), time.Now())
415
+ if err != nil {
416
+ logger.Error(err)
417
+ return
418
+ }
419
+ parsed, err := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
420
+ if err != nil {
421
+ logger.Error(err)
422
+ return
423
+ }
424
+ if len(parsed.Path) == 0 {
425
+ logger.Error("Empty path for commentId ", commentId)
426
+ return
427
+ }
428
+ if len(parsed.Path) > 1 {
429
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
430
+ Id: commentId,
431
+ Post: &pb.FacebookPost{Id: post.Id},
432
+ User: &pb.FacebookUser{
433
+ Username: parsed.Path[1:],
434
+ Name: selection.Find("h3 > a").Text(),
435
+ },
436
+ Content: selection.Find("h3 + div").Text(),
437
+ CreatedAt: createdAtWhenResult.Time.Unix(),
438
+ })
439
+ }
440
+ }
390
441
  })
391
442
 
392
443
  }
393
444
  })
394
445
 
395
446
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
396
- if commentPaging < 3 {
397
- logger.Info("Comment paging", commentPaging)
398
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
399
- commentPaging = commentPaging + 1
400
- }
447
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
401
448
  })
449
+ if len(commentNextCursor) > 0 {
450
+ err = collector.Visit(commentNextCursor)
451
+ } else {
452
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
453
+ }
402
454
 
403
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
404
455
  return err, post
405
456
  }
406
457
 
@@ -409,13 +460,36 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
409
460
  return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
410
461
  }
411
462
 
463
+ func (f *Fbcolly) FetchMyGroups() (error, *pb.FacebookGroupList) {
464
+ collector := f.collector.Clone()
465
+ err := setupSharedCollector(collector)
466
+ result := &pb.FacebookGroupList{Groups: []*pb.FacebookGroup{}}
467
+
468
+ collector.OnHTML("li table a", func(element *colly.HTMLElement) {
469
+ result.Groups = append(result.Groups, &pb.FacebookGroup{
470
+ Id: getNumberFromText(element.Attr("href")),
471
+ Name: element.Text,
472
+ })
473
+ })
474
+
475
+ err = collector.Visit("https://mbasic.facebook.com/groups/?seemore")
476
+ if err != nil {
477
+ logger.Error("crawl by colly err:", err)
478
+ }
479
+ return err, result
480
+ }
481
+
412
482
  //func getUsernameFromHref(href string) string {
413
483
  // return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
414
484
  //}
415
485
 
416
486
  func getUserIdFromCommentHref(href string) int64 {
417
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
418
- return id
487
+ match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
488
+ if len(match) > 0 {
489
+ id, _ := strconv.ParseInt(match[1], 10, 64)
490
+ return id
491
+ }
492
+ return 0
419
493
  }
420
494
 
421
495
  func getUrlFromRedirectHref(href string) string {
@@ -430,17 +504,19 @@ func getImageIdFromHref(href string) int64 {
430
504
  }
431
505
 
432
506
  func getNumberFromText(text string) int64 {
433
- logger.Error("reaction", text)
507
+ logger.Info("getNumberFromText ", text)
434
508
  if len(text) > 0 {
435
- match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
436
- count, _ := strconv.ParseInt(match[1], 10, 64)
437
- switch match[2] {
438
- case "k":
439
- count *= 1000
440
- case "m":
441
- count *= 1000000
509
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
510
+ if len(match) > 0 {
511
+ count, _ := strconv.ParseInt(match[1], 10, 64)
512
+ switch match[2] {
513
+ case "k":
514
+ count *= 1000
515
+ case "m":
516
+ count *= 1000000
517
+ }
518
+ return count
442
519
  }
443
- return count
444
520
  }
445
521
  return 0
446
522
  }