fbcrawl-colly 0.1.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96d233862f21d1dff4a447d57f0f4a4ee6311506d7b5cf1990d75bf92043defa
4
- data.tar.gz: a238ee4a34eac5f33fa90bba0d5bdb22169bd0b54034682f8fbb25c1ca4de039
3
+ metadata.gz: 0fc0e07942d352bb9b49c93a106d138bc9946f8e1943bcd81f6181082e79c413
4
+ data.tar.gz: 1e31f7fe0bc3bf83c82b90d84080a257734cfe63eaa4d79fdb89aa52204e2eb4
5
5
  SHA512:
6
- metadata.gz: 36d58d4ef38bf94164f94fe0a841171cc5bdd12a4f2503e818858d77cae029e1fc79cddd3213b2bb9e130e70833703df97ad18700d92c2cc3e31401b8e1f1443
7
- data.tar.gz: 5c56a623910f87369812c0d74908e618490b4aeead9d22b055f8076ca040ba9fe5fff0bb380c44e7e435e0d83a13823b259b0f6c57233d7fb3ec38ff83de0832
6
+ metadata.gz: 55d00db7f51b078c1ca7c46a59b06eb4d62b1859fac6e11a2da3c117f7c2dd3958d442559f9b84573f4d4f5e2d02a77539287d5628ff60f421405a3e2910e2b7
7
+ data.tar.gz: 550b26405d7bbd13356f1ca7ef30edfa682c04364f2f8fc409269c74fe25e8cb0535464d395083109b04e17de129bd851da3f035dd2dec6af2095ed357a1ddd7
data/.gitignore CHANGED
@@ -14,3 +14,6 @@ last.html
14
14
  *.db
15
15
  /fbcrawl/fbcrawl.pb.go
16
16
  /lib/fbcrawl_pb.rb
17
+
18
+ mkmf.log
19
+ .rakeTasks
@@ -0,0 +1,28 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ fbcrawl-colly (0.2.4)
5
+ ffi
6
+ google-protobuf
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ffi (1.13.1)
12
+ google-protobuf (3.13.0)
13
+ minitest (5.14.1)
14
+ rake (12.3.3)
15
+ rake-compiler (1.1.1)
16
+ rake
17
+
18
+ PLATFORMS
19
+ ruby
20
+
21
+ DEPENDENCIES
22
+ fbcrawl-colly!
23
+ minitest (~> 5.0)
24
+ rake (~> 12.0)
25
+ rake-compiler
26
+
27
+ BUNDLED WITH
28
+ 2.1.4
data/Rakefile CHANGED
@@ -7,4 +7,13 @@ Rake::TestTask.new(:test) do |t|
7
7
  t.test_files = FileList["test/**/*_test.rb"]
8
8
  end
9
9
 
10
+ task :fbcrawl_colly do
11
+ Dir.chdir("./ext/fbcrawl_colly/") do
12
+ require './extconf'
13
+ `make`
14
+ end
15
+ end
16
+
17
+ task :compile => [:fbcrawl_colly]
18
+ task :test => :compile
10
19
  task :default => :test
@@ -1,5 +1,6 @@
1
1
  require 'mkmf'
2
- MakeMakefile::find_executable 'go'
3
- MakeMakefile::find_executable 'protoc'
4
- MakeMakefile::find_executable 'protoc-gen-go'
5
- $makefile_created = true
2
+ requirement_passed = true
3
+ requirement_passed &&= MakeMakefile::find_executable 'go'
4
+ requirement_passed &&= MakeMakefile::find_executable 'protoc'
5
+ requirement_passed &&= MakeMakefile::find_executable 'protoc-gen-go'
6
+ $makefile_created = requirement_passed
@@ -10,14 +10,35 @@ import (
10
10
  "github.com/gocolly/colly/extensions"
11
11
  "github.com/gocolly/colly/storage"
12
12
  "github.com/google/logger"
13
+ "github.com/olebedev/when"
14
+ "github.com/olebedev/when/rules/common"
15
+ "github.com/olebedev/when/rules/en"
16
+ "github.com/thoas/go-funk"
13
17
  "net/url"
14
18
  "qnetwork.net/fbcrawl/fbcrawl"
15
19
  "regexp"
20
+ "strconv"
16
21
  "strings"
22
+ "time"
17
23
  )
18
24
 
19
25
  type Fbcolly struct {
20
26
  collector *colly.Collector
27
+ w *when.Parser
28
+ }
29
+ type FbDataPostContext struct {
30
+ PublishTime int64 `json:"publish_time"`
31
+ }
32
+ type FbDataInsight struct {
33
+ FbDataPostContext `json:"post_context"`
34
+ }
35
+ type FbDataFt struct {
36
+ ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
+ PhotoId int64 `json:"photo_id,string"`
39
+ PageId int64 `json:"page_id,string"`
40
+ TopLevelPostId int64 `json:"top_level_post_id,string"`
41
+ PageInsights map[string]FbDataInsight `json:"page_insights"`
21
42
  }
22
43
 
23
44
  func sharedOnRequest(request *colly.Request) {
@@ -41,7 +62,7 @@ func sharedOnRequest(request *colly.Request) {
41
62
  func setupSharedCollector(collector *colly.Collector) error {
42
63
  var err error
43
64
  extensions.Referer(collector)
44
-
65
+ collector.AllowURLRevisit = true
45
66
  collector.OnRequest(sharedOnRequest)
46
67
  collector.OnResponse(sharedOnResponse)
47
68
  collector.OnError(func(resp *colly.Response, errHttp error) {
@@ -79,39 +100,46 @@ func getForm(element *colly.HTMLElement, err error) (string, error, map[string]s
79
100
  func New() *Fbcolly {
80
101
  f := Fbcolly{}
81
102
  f.collector = colly.NewCollector()
103
+ f.w = when.New(nil)
104
+ f.w.Add(en.All...)
105
+ f.w.Add(common.All...)
82
106
  return &f
83
107
  }
84
108
 
85
- func (f *Fbcolly) Login(email string, password string, otp string) error {
109
+ func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
86
110
  collector := f.collector.Clone()
87
- setupSharedCollector(collector)
111
+ err := setupSharedCollector(collector)
88
112
 
89
113
  logger.Info("Login using email", email)
90
-
91
- var err error
114
+ loggedIn := false
115
+ firstLogin := true
92
116
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
93
- logger.Info("OnHTML login_form")
94
- loginURL, err, reqMap := getForm(element, err)
95
- if err != nil {
96
- logger.Error(err)
97
- return
98
- }
99
- reqMap["email"] = email
100
- reqMap["pass"] = password
101
- logger.Info("req map:", reqMap)
102
- err = collector.Post(loginURL, reqMap)
103
- if err != nil {
104
- logger.Error("post err:", err)
117
+ if firstLogin {
118
+ firstLogin = false
119
+ logger.Info("OnHTML login_form")
120
+ loginURL, err, reqMap := getForm(element, err)
121
+ if err != nil {
122
+ logger.Error(err)
123
+ return
124
+ }
125
+ reqMap["email"] = email
126
+ reqMap["pass"] = password
127
+ logger.Info("req map:", reqMap)
128
+ err = collector.Post(loginURL, reqMap)
129
+ if err != nil {
130
+ logger.Error("post err:", err)
131
+ }
105
132
  }
106
133
  })
107
134
 
108
135
  collector.OnHTML("a[href=\"/login/save-device/cancel/?flow=interstitial_nux&nux_source=regular_login\"]", func(element *colly.HTMLElement) {
109
- collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
136
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
110
137
  })
111
138
 
112
139
  collector.OnHTML("form[action=\"/login/checkpoint/\"]", func(element *colly.HTMLElement) {
113
140
 
114
141
  checkpointUrl, err, reqMap := getForm(element, err)
142
+ shouldSubmit := false
115
143
  if err != nil {
116
144
  logger.Error(err)
117
145
  return
@@ -121,6 +149,7 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
121
149
  //Save Device
122
150
  logger.Info("OnHTML Save Device checkpoint")
123
151
  reqMap["name_action_selected"] = "dont_save"
152
+ shouldSubmit = true
124
153
  } else if element.DOM.Find("input[name=\"approvals_code\"]").Length() > 0 {
125
154
  logger.Info("OnHTML OTP checkpoint")
126
155
  //logger.Info("Please input OTP")
@@ -128,11 +157,15 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
128
157
  //code, _ := reader.ReadString('\n')
129
158
  code := otp[0:6]
130
159
  reqMap["approvals_code"] = code
160
+ shouldSubmit = true
131
161
  } else {
132
162
  logger.Info("OnHTML Only Continue checkpoint")
163
+
164
+ }
165
+ if shouldSubmit {
166
+ logger.Info("req map:", reqMap)
167
+ err = collector.Post(checkpointUrl, reqMap)
133
168
  }
134
- logger.Info("req map:", reqMap)
135
- err = collector.Post(checkpointUrl, reqMap)
136
169
  if err != nil {
137
170
  logger.Error("post err:", err)
138
171
  }
@@ -141,19 +174,24 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
141
174
  collector.OnHTML("form[action=\"/search/\"]", func(element *colly.HTMLElement) {
142
175
  //We're in home
143
176
  logger.Info("I'm IN HOME, navigate to page now")
177
+ loggedIn = true
144
178
  })
145
179
 
146
180
  err = collector.Visit("https://mbasic.facebook.com/")
147
181
  if err != nil {
148
182
  logger.Error("crawl by colly err:", err)
149
183
  }
150
- logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
151
- //return err, storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/"))
152
- //return err, collector.getS.Cookies("https://mbasic.facebook.com/")
153
- return err
184
+
185
+ if loggedIn {
186
+ logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
187
+ return storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")), err
188
+ } else {
189
+ return "", err
190
+ }
191
+
154
192
  }
155
193
 
156
- func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostList) {
194
+ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
157
195
  collector := f.collector.Clone()
158
196
  err := setupSharedCollector(collector)
159
197
  currentPage := 1
@@ -163,72 +201,246 @@ func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostLi
163
201
  currentPage++
164
202
  if currentPage < 3 {
165
203
  logger.Info("Will fetch page", currentPage)
166
- collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
204
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
205
+ }
206
+ })
207
+ collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
208
+ dataElement := element
209
+ post := &fbcrawl.FacebookPost{}
210
+ var fbDataFt FbDataFt
211
+ jsonData := dataElement.Attr("data-ft")
212
+
213
+ logger.Info(jsonData)
214
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
215
+ if err != nil {
216
+ logger.Error(err)
217
+ return
218
+ }
219
+ logger.Info("Post ", fbDataFt)
220
+ post.Id = fbDataFt.TopLevelPostId
221
+ post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
222
+ post.User = &fbcrawl.FacebookUser{
223
+ Id: fbDataFt.ContentOwnerIdNew,
224
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
225
+ }
226
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
227
+ //Content
228
+
229
+ //NO BACKGROUND TEXT ONLY
230
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
231
+ return selection.Text()
232
+ }), "\n")
233
+
234
+ if len(post.Content) == 0 {
235
+ // TEXT WITH BACKGROUND
236
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
237
+ }
238
+
239
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
240
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
241
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
242
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
243
+ i, _ := strconv.ParseInt(id, 10, 64)
244
+ return &fbcrawl.FacebookImage{
245
+ Id: i,
246
+ }
247
+ })).([]*fbcrawl.FacebookImage)
248
+
249
+ if fbDataFt.PhotoId > 0 {
250
+ post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
167
251
  }
252
+ result = append(result, post)
253
+ })
254
+
255
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
256
+ if err != nil {
257
+ logger.Error("crawl by colly err:", err)
258
+ }
259
+ return err, &fbcrawl.FacebookPostList{Posts: result}
260
+ }
261
+
262
+ func (f *Fbcolly) FetchGroupInfo(groupId int64) (error, *fbcrawl.FacebookGroup) {
263
+ collector := f.collector.Clone()
264
+ err := setupSharedCollector(collector)
265
+ result := &fbcrawl.FacebookGroup{Id: groupId}
266
+
267
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
268
+ result.Name = element.Text
168
269
  })
270
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
271
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(),10,64)
272
+ })
273
+
274
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d?view=info", groupId))
275
+ if err != nil {
276
+ logger.Error("crawl by colly err:", err)
277
+ }
278
+ return err, result
279
+ }
169
280
 
170
- //TODO: May not need this
171
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
172
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
281
+ func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
282
+ collector := f.collector.Clone()
283
+ err := setupSharedCollector(collector)
284
+ currentPage := 1
285
+ var result []*fbcrawl.FacebookImage
286
+
287
+ collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
288
+ currentPage++
289
+ logger.Info("Will fetch page", currentPage)
290
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
291
+ })
173
292
 
174
- result = append(result, &fbcrawl.FacebookPost{
175
- Id: u.Query().Get("id"),
176
- Group: &fbcrawl.FacebookGroup{Id: groupId},
293
+ collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
294
+ result = append(result, &fbcrawl.FacebookImage{
295
+ Id: getImageIdFromHref(element.Attr("href")),
177
296
  })
178
297
  //f.detailCollector.Visit(url)
179
298
  })
180
299
 
181
- err = collector.Visit("https://mbasic.facebook.com/groups/" + groupId)
300
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
182
301
  if err != nil {
183
302
  logger.Error("crawl by colly err:", err)
184
303
  }
185
- return err, &fbcrawl.FacebookPostList{Posts: result}
304
+ return err, &fbcrawl.FacebookImageList{Images: result}
305
+ }
306
+
307
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
308
+ collector := f.collector.Clone()
309
+ err := setupSharedCollector(collector)
310
+ result := fbcrawl.FacebookImage{Id: imageId}
311
+
312
+ collector.OnHTML("a", func(element *colly.HTMLElement) {
313
+ result.Url = element.Attr("href")
314
+ })
315
+
316
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/photo/view_full_size/?fbid=%d", imageId))
317
+ if err != nil {
318
+ logger.Error("crawl by colly err:", err)
319
+ }
320
+ return err, &result
186
321
  }
187
322
 
188
- func (f *Fbcolly) FetchPost(groupId string, postId string) (error, *fbcrawl.FacebookPost) {
323
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
189
324
  collector := f.collector.Clone()
190
325
  err := setupSharedCollector(collector)
191
- post := &fbcrawl.FacebookPost{}
326
+ post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
327
+ commentPaging := 0
192
328
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
193
329
  dataElement := element.DOM.Find("div[data-ft]")
194
330
  if dataElement.Length() > 0 {
195
- var result map[string]string
331
+ var result FbDataFt
196
332
  jsonData, isExist := dataElement.Attr("data-ft")
197
333
  if isExist {
198
- json.Unmarshal([]byte(jsonData), &result)
334
+ logger.Info(jsonData)
335
+ err = json.Unmarshal([]byte(jsonData), &result)
336
+ if err != nil {
337
+ logger.Error(err)
338
+ return
339
+ }
199
340
  logger.Info("Post ", result)
200
- post.Id = result["top_level_post_id"]
201
- post.Group = &fbcrawl.FacebookGroup{Id: result["page_id"], Name: dataElement.Find("h3 strong:last-child a").Text()}
202
- post.User = &fbcrawl.FacebookUser{Id: result["content_owner_id_new"], Name: dataElement.Find("h3 strong:first-child a").Text()}
341
+ post.Id = result.TopLevelPostId
342
+ post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
343
+ post.User = &fbcrawl.FacebookUser{
344
+ Id: result.ContentOwnerIdNew,
345
+ Name: dataElement.Find("h3 strong:first-child a").Text(),
346
+ }
347
+ post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
203
348
  //Content
349
+
350
+ //NO BACKGROUND TEXT ONLY
204
351
  post.Content = strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
205
352
  return selection.Text()
206
353
  }), "\n")
207
354
 
208
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
209
- return selection.Text()
210
- }), "\n"))
355
+ if len(post.Content) == 0 {
356
+ // TEXT WITH BACKGROUND
357
+ post.Content = dataElement.Find("div[style*=\"background-image:url\"]").Text()
358
+ }
359
+
360
+ post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
361
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
362
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
363
+ i, _ := strconv.ParseInt(id, 10, 64)
364
+ return &fbcrawl.FacebookImage{
365
+ Id: i,
366
+ }
367
+ })).([]*fbcrawl.FacebookImage)
368
+
369
+ if result.PhotoId > 0 {
370
+ post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
371
+ }
211
372
  }
212
- post.Comments = []*fbcrawl.FacebookComment{}
373
+
213
374
  //Comment
214
375
  element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
215
376
  //author
216
- commentId := selection.AttrOr("id", "")
377
+ commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
217
378
  logger.Info("comment", commentId)
218
- //idRegex, _ := regexp.Compile("")
219
- //idRegex.FindString()
379
+ createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
220
380
  post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
221
381
  Id: commentId,
222
382
  Post: &fbcrawl.FacebookPost{Id: post.Id},
223
383
  User: &fbcrawl.FacebookUser{
224
- Id: regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch(selection.Find("h3 > a").AttrOr("href", ""))[1],
384
+ Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
225
385
  Name: selection.Find("h3 > a").Text(),
226
386
  },
227
- Content: selection.Find("h3 + div").Text(),
387
+ Content: selection.Find("h3 + div").Text(),
388
+ CreatedAt: createdAtWhenResult.Time.Unix(),
228
389
  })
229
390
  })
391
+
230
392
  }
231
393
  })
232
- collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%s?view=permalink&id=%s&_rdr", groupId, postId))
394
+
395
+ collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
396
+ if commentPaging < 3 {
397
+ logger.Info("Comment paging", commentPaging)
398
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
399
+ commentPaging = commentPaging + 1
400
+ }
401
+ })
402
+
403
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
233
404
  return err, post
234
405
  }
406
+
407
+ func (f *Fbcolly) LoginWithCookies(cookies string) error {
408
+ collector := f.collector
409
+ return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
410
+ }
411
+
412
+ //func getUsernameFromHref(href string) string {
413
+ // return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
414
+ //}
415
+
416
+ func getUserIdFromCommentHref(href string) int64 {
417
+ id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
418
+ return id
419
+ }
420
+
421
+ func getUrlFromRedirectHref(href string) string {
422
+ u, _ := url.Parse(href)
423
+ return u.Query().Get("u")
424
+ }
425
+
426
+ func getImageIdFromHref(href string) int64 {
427
+ u, _ := url.Parse(href)
428
+ i, _ := strconv.ParseInt(u.Query().Get("fbid"), 10, 64)
429
+ return i
430
+ }
431
+
432
+ func getNumberFromText(text string) int64 {
433
+ logger.Error("reaction", text)
434
+ if len(text) > 0 {
435
+ match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
436
+ count, _ := strconv.ParseInt(match[1], 10, 64)
437
+ switch match[2] {
438
+ case "k":
439
+ count *= 1000
440
+ case "m":
441
+ count *= 1000000
442
+ }
443
+ return count
444
+ }
445
+ return 0
446
+ }
@@ -1,4 +1,4 @@
1
- require_relative 'lib/fbcrawl-colly/version'
1
+ require_relative 'lib/fbcrawl_colly/version'
2
2
 
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "fbcrawl-colly"
@@ -32,4 +32,5 @@ Gem::Specification.new do |spec|
32
32
 
33
33
  spec.add_runtime_dependency 'ffi'
34
34
  spec.add_runtime_dependency 'google-protobuf'
35
+ spec.add_development_dependency 'rake-compiler'
35
36
  end
@@ -1,33 +1,51 @@
1
1
  syntax = "proto3";
2
2
 
3
+ package fbcrawl_colly;
3
4
  option go_package = "./fbcrawl;fbcrawl";
4
5
 
5
6
  // The request message containing the user's name.
6
7
  message FacebookGroup {
7
- string id = 1;
8
+ int64 id = 1;
8
9
  string name = 2;
10
+ int64 member_count = 3;
9
11
  }
10
12
 
11
13
  message FacebookUser {
12
- string id = 1;
14
+ int64 id = 1;
13
15
  string name = 2;
14
16
  }
15
17
 
16
18
  message FacebookPost {
17
- string id = 1;
19
+ int64 id = 1;
18
20
  FacebookGroup group = 2;
19
21
  FacebookUser user = 3;
20
22
  string content = 4;
23
+ string content_link = 6;
24
+ FacebookImage content_image = 8;
25
+ repeated FacebookImage content_images = 7;
21
26
  repeated FacebookComment comments = 5;
27
+ int64 created_at = 9;
28
+ int64 reaction_count = 10;
29
+ int64 comment_count = 11;
30
+ }
31
+
32
+ message FacebookImage {
33
+ int64 id = 1;
34
+ string url = 2;
22
35
  }
23
36
 
24
37
  message FacebookComment {
25
- string id = 1;
38
+ int64 id = 1;
26
39
  FacebookPost post = 2;
27
40
  FacebookUser user = 3;
28
41
  string content = 4;
42
+ int64 created_at = 5;
29
43
  }
30
44
 
31
45
  message FacebookPostList {
32
46
  repeated FacebookPost posts = 1;
33
47
  }
48
+
49
+ message FacebookImageList {
50
+ repeated FacebookImage images = 1;
51
+ }
data/go.mod CHANGED
@@ -13,6 +13,7 @@ require (
13
13
  github.com/golang/protobuf v1.4.2
14
14
  github.com/google/logger v1.1.0
15
15
  github.com/kennygrant/sanitize v1.2.4 // indirect
16
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254
16
17
  github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
17
18
  github.com/temoto/robotstxt v1.1.1 // indirect
18
19
  github.com/thoas/go-funk v0.7.0
data/go.sum CHANGED
@@ -1,4 +1,6 @@
1
1
  cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2
+ github.com/AlekSi/pointer v1.0.0 h1:KWCWzsvFxNLcmM5XmiqHsGTTsuwZMsLFwWF9Y+//bNE=
3
+ github.com/AlekSi/pointer v1.0.0/go.mod h1:1kjywbfcPFCmncIxtk6fIEub6LKrfMz3gc5QKVOSOA8=
2
4
  github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
3
5
  github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
4
6
  github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
@@ -18,6 +20,8 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
18
20
  github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
19
21
  github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
20
22
  github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
23
+ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
24
+ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
21
25
  github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
22
26
  github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
23
27
  github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
@@ -50,6 +54,10 @@ github.com/google/logger v1.1.0 h1:saB74Etb4EAJNH3z74CVbCKk75hld/8T0CsXKetWCwM=
50
54
  github.com/google/logger v1.1.0/go.mod h1:w7O8nrRr0xufejBlQMI83MXqRusvREoJdaAxV+CoAB4=
51
55
  github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
52
56
  github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
57
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254 h1:JYoQR67E1vv1WGoeW8DkdFs7vrIEe/5wP+qJItd5tUE=
58
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254/go.mod h1:DPucAeQGDPUzYUt+NaWw6qsF5SFapWWToxEiVDh2aV0=
59
+ github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
60
+ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
53
61
  github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
54
62
  github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
55
63
  github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -1,16 +1,4 @@
1
- require 'ffi'
2
- require 'fbcrawl_pb'
3
- module FbcrawlColly
4
- extend FFI::Library
5
1
 
6
- ffi_lib File.expand_path("../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
7
- attach_function :free, [ :pointer ], :void
2
+ module FbcrawlColly
8
3
 
9
- attach_function :Init, [], :pointer
10
- attach_function :Login, [:pointer, :string, :string], :void
11
- attach_function :FetchGroupFeed, [:pointer, :string], :string
12
- attach_function :FetchPost, [:pointer, :string, :string], :string
13
- # attach_function :FetchGroup, [:pointer, :string], :pointer
14
- # attach_function :Login, [:pointer, :string, :string, :string], :void
15
- # attach_function :FreePointer, [:pointer], :void
16
4
  end
@@ -0,0 +1,57 @@
1
+ require 'ffi'
2
+ require_relative '../fbcrawl_pb'
3
+ require_relative './ffi'
4
+
5
+ module FbcrawlColly
6
+ class Colly
7
+ def initialize
8
+ super
9
+ @colly = ::FFI::AutoPointer.new(FbcrawlColly::FFI::Init(), FbcrawlColly::FFI.method(:FreeColly))
10
+ end
11
+
12
+ def login(email, password)
13
+ s, ptr = FbcrawlColly::FFI.Login(@colly, email, password)
14
+ FbcrawlColly::FFI.free(ptr)
15
+ s
16
+ end
17
+
18
+ def login_with_cookies(cookies)
19
+ FbcrawlColly::FFI.LoginWithCookies(@colly, cookies)
20
+ end
21
+
22
+ def fetch_group_info(group_id)
23
+ s, ptr = FbcrawlColly::FFI.FetchGroupInfo(@colly, group_id)
24
+ list = FbcrawlColly::FacebookGroup.decode(s)
25
+ FbcrawlColly::FFI.free(ptr)
26
+ list
27
+ end
28
+
29
+ def fetch_group_feed(group_id)
30
+ s, ptr = FbcrawlColly::FFI.FetchGroupFeed(@colly, group_id)
31
+ list = FbcrawlColly::FacebookPostList.decode(s)
32
+ FbcrawlColly::FFI.free(ptr)
33
+ list
34
+ end
35
+
36
+ def fetch_post(group_id, post_id)
37
+ s, ptr = FbcrawlColly::FFI.FetchPost(@colly, group_id, post_id)
38
+ post = FbcrawlColly::FacebookPost.decode(s)
39
+ FbcrawlColly::FFI.free(ptr)
40
+ post
41
+ end
42
+
43
+ def fetch_content_images(post_id)
44
+ s, ptr = FbcrawlColly::FFI.FetchContentImages(@colly, post_id)
45
+ imageList = FbcrawlColly::FacebookImageList.decode(s)
46
+ FbcrawlColly::FFI.free(ptr)
47
+ imageList
48
+ end
49
+
50
+ def fetch_image_url(image_id)
51
+ s, ptr = FbcrawlColly::FFI.FetchImageUrl(@colly, image_id)
52
+ image = FbcrawlColly::FacebookImage.decode(s)
53
+ FbcrawlColly::FFI.free(ptr)
54
+ image
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,18 @@
1
+ require 'ffi'
2
+ module FbcrawlColly::FFI
3
+ extend FFI::Library
4
+
5
+ ffi_lib File.expand_path("../../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
6
+ attach_function :free, [ :pointer ], :void
7
+
8
+ attach_function :Init, [], :pointer
9
+ attach_function :FreeColly, [:pointer], :pointer
10
+ attach_function :Login, [:pointer, :string, :string], :strptr
11
+ attach_function :LoginWithCookies, [:pointer, :string], :void
12
+ attach_function :FetchGroupInfo, [:pointer, :int64], :strptr
13
+ attach_function :FetchGroupFeed, [:pointer, :int64], :strptr
14
+ attach_function :FetchPost, [:pointer, :int64, :int64], :strptr
15
+ attach_function :FetchContentImages, [:pointer, :int64], :strptr
16
+ attach_function :FetchImageUrl, [:pointer, :int64], :strptr
17
+ # attach_function :FetchGroup, [:pointer, :string], :pointer
18
+ end
@@ -1,3 +1,3 @@
1
1
  module FbcrawlColly
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.4"
3
3
  end
data/main.go CHANGED
@@ -27,36 +27,76 @@ var password = flag.String("password", "change_me", "facebook password")
27
27
  var otp = flag.String("otp", "123456", "facebook otp")
28
28
  var groupId = flag.String("groupId", "334294967318328", "facebook group id, default is 334294967318328")
29
29
 
30
- var tmp = fbcolly.New()
30
+ var allInstances = map[uintptr]*fbcolly.Fbcolly{}
31
31
 
32
32
  //export Init
33
33
  func Init() uintptr {
34
- return (uintptr)(unsafe.Pointer(tmp))
34
+ instance := fbcolly.New()
35
+ ptr := (uintptr)(unsafe.Pointer(instance))
36
+ allInstances[ptr] = instance
37
+ return ptr
38
+ }
39
+
40
+ //export FreeColly
41
+ func FreeColly(pointer unsafe.Pointer) {
42
+ delete(allInstances, uintptr(pointer))
35
43
  }
36
44
 
37
45
  //export Login
38
- func Login(pointer unsafe.Pointer, email *C.char, password *C.char) {
46
+ func Login(pointer unsafe.Pointer, email *C.char, password *C.char) *C.char {
39
47
  p := (*fbcolly.Fbcolly)(pointer)
40
- //print(p.E)
41
- p.Login(C.GoString(email), C.GoString(password), "")
48
+ cookies, err := p.Login(C.GoString(email), C.GoString(password), "")
49
+ if err == nil {
50
+ return C.CString(cookies)
51
+ }
52
+ return nil
42
53
  }
43
54
 
55
+ //export LoginWithCookies
56
+ func LoginWithCookies(pointer unsafe.Pointer, cookies *C.char) {
57
+ p := (*fbcolly.Fbcolly)(pointer)
58
+ p.LoginWithCookies(C.GoString(cookies))
59
+ }
60
+
61
+ //export FetchGroupInfo
62
+ func FetchGroupInfo(pointer unsafe.Pointer, groupId int64) unsafe.Pointer {
63
+ p := (*fbcolly.Fbcolly)(pointer)
64
+ _, groupInfo := p.FetchGroupInfo(groupId)
65
+ marshaled, _ := proto.Marshal(groupInfo)
66
+ return C.CBytes(append(marshaled, 0))
67
+ }
44
68
  //export FetchGroupFeed
45
- func FetchGroupFeed(pointer unsafe.Pointer, groupId *C.char) unsafe.Pointer {
69
+ func FetchGroupFeed(pointer unsafe.Pointer, groupId int64) unsafe.Pointer {
46
70
  p := (*fbcolly.Fbcolly)(pointer)
47
- _, postsList := p.FetchGroupFeed(C.GoString(groupId))
71
+ _, postsList := p.FetchGroupFeed(groupId)
48
72
  marshaledPostsList, _ := proto.Marshal(postsList)
49
73
  return C.CBytes(append(marshaledPostsList, 0))
50
74
  }
51
75
 
52
76
  //export FetchPost
53
- func FetchPost(pointer unsafe.Pointer, groupId *C.char, postId *C.char) unsafe.Pointer {
77
+ func FetchPost(pointer unsafe.Pointer, groupId int64, postId int64) unsafe.Pointer {
54
78
  p := (*fbcolly.Fbcolly)(pointer)
55
- _, post := p.FetchPost(C.GoString(groupId), C.GoString(postId))
79
+ _, post := p.FetchPost(groupId, postId)
56
80
  marshaledPost, _ := proto.Marshal(post)
57
81
  return C.CBytes(append(marshaledPost, 0))
58
82
  }
59
83
 
84
+ //export FetchContentImages
85
+ func FetchContentImages(pointer unsafe.Pointer, postId int64) unsafe.Pointer {
86
+ p := (*fbcolly.Fbcolly)(pointer)
87
+ _, imageList := p.FetchContentImages(postId)
88
+ marshaled, _ := proto.Marshal(imageList)
89
+ return C.CBytes(append(marshaled, 0))
90
+ }
91
+
92
+ //export FetchImageUrl
93
+ func FetchImageUrl(pointer unsafe.Pointer, imageId int64) unsafe.Pointer {
94
+ p := (*fbcolly.Fbcolly)(pointer)
95
+ _, image := p.FetchImageUrl(imageId)
96
+ marshaled, _ := proto.Marshal(image)
97
+ return C.CBytes(append(marshaled, 0))
98
+ }
99
+
60
100
  func main() {
61
101
  //r := regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch()[1]
62
102
  //print(r.FindStringSubmatch("/liem.phamthanh.161?refid=18&__tn__=R")[1])
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fbcrawl-colly
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duy Le
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-05 00:00:00.000000000 Z
11
+ date: 2020-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: Crawl mbasic.facebook.com using GO Colly
42
56
  email:
43
57
  - duyleekun@gmail.com
@@ -47,9 +61,9 @@ extensions:
47
61
  extra_rdoc_files: []
48
62
  files:
49
63
  - ".gitignore"
50
- - ".travis.yml"
51
64
  - CODE_OF_CONDUCT.md
52
65
  - Gemfile
66
+ - Gemfile.lock
53
67
  - LICENSE.txt
54
68
  - README.md
55
69
  - Rakefile
@@ -64,7 +78,9 @@ files:
64
78
  - go.mod
65
79
  - go.sum
66
80
  - lib/fbcrawl-colly.rb
67
- - lib/fbcrawl-colly/version.rb
81
+ - lib/fbcrawl_colly/colly.rb
82
+ - lib/fbcrawl_colly/ffi.rb
83
+ - lib/fbcrawl_colly/version.rb
68
84
  - main.go
69
85
  homepage: http://github.com/duyleekun/fbcrawl-colly
70
86
  licenses:
@@ -1,6 +0,0 @@
1
- ---
2
- language: ruby
3
- cache: bundler
4
- rvm:
5
- - 2.7.1
6
- before_install: gem install bundler -v 2.1.4