fbcrawl-colly 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 383de209c502265a8533af201e3b9538edffc115af1575bf9662508abb142e00
4
- data.tar.gz: 631c53a8c82e063df262ccac4f6c74f4591296b700ffa627110a9bf0dce40358
3
+ metadata.gz: 6e43a203c1c38131f0d9eb32b67a1859bd1530aac9249751ff0329a5901388f6
4
+ data.tar.gz: f3520762f00d2b1a2d475a9b97d2141fe8cd98faf718ccf3d8a797e61489f976
5
5
  SHA512:
6
- metadata.gz: b8c1505f7183c0a30a6d6679319a947be589828771e3bc231915cb841aad5e79b7221091862f6df537a701eaa2bf21bb996ba689e301d00a79964fd47e45f258
7
- data.tar.gz: 2ce2e0870f9455cdaeab48c840e1c9f5fb35afc340e0c56830afca90eb7bcebcf120430b7c3d043d58274d545ab0c7bb99d4062b36885b1e832bb540b7bf5080
6
+ metadata.gz: 2560aa6a3239671a14f82a95034bbad03684dd44b0fc32cb2092b404678b2ec0e124a33db64a92f5c22f4cd3d9d94a42ccc40a6ab2e1e362088d0f02f05d22f3
7
+ data.tar.gz: 305f3d463d75bc3edcfcd1b5a985d9a9882e990e850786132eb1b6d4e93f4e3e31a2c0e9aeb23141cc0633cd1d5ad8e1d88ec4c0e3e4d2d201a6298a7c8699c6
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.2)
4
+ fbcrawl-colly (0.2.3)
5
5
  ffi
6
6
  google-protobuf
7
7
 
@@ -201,16 +201,52 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
201
201
  err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
202
202
  }
203
203
  })
204
+ collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
205
+ dataElement := element
206
+ post := &fbcrawl.FacebookPost{}
207
+ var fbDataFt FbDataFt
208
+ jsonData := dataElement.Attr("data-ft")
209
+
210
+ logger.Info(jsonData)
211
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
212
+ if err != nil {
213
+ logger.Error(err)
214
+ return
215
+ }
216
+ logger.Info("Post ", fbDataFt)
217
+ post.Id = fbDataFt.TopLevelPostId
218
+ post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
219
+ post.User = &fbcrawl.FacebookUser{
220
+ Id: fbDataFt.ContentOwnerIdNew,
221
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
222
+ }
223
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
224
+ //Content
204
225
 
205
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
- logger.Info("Post found at", element.Attr("href"))
207
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
208
- postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
209
- result = append(result, &fbcrawl.FacebookPost{
210
- Id: postId,
211
- Group: &fbcrawl.FacebookGroup{Id: groupId},
212
- })
213
- //f.detailCollector.Visit(url)
226
+ //NO BACKGROUND TEXT ONLY
227
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
228
+ return selection.Text()
229
+ }), "\n")
230
+
231
+ if len(post.Content) == 0 {
232
+ // TEXT WITH BACKGROUND
233
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
234
+ }
235
+
236
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
237
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
238
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
239
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
240
+ i, _ := strconv.ParseInt(id, 10, 64)
241
+ return &fbcrawl.FacebookImage{
242
+ Id: i,
243
+ }
244
+ })).([]*fbcrawl.FacebookImage)
245
+
246
+ if fbDataFt.PhotoId > 0 {
247
+ post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
248
+ }
249
+ result = append(result, post)
214
250
  })
215
251
 
216
252
  err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
@@ -300,7 +336,7 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
300
336
  }
301
337
 
302
338
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
- post.ReactionCount = getReactionFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
339
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
304
340
  post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
305
341
  i, _ := strconv.ParseInt(id, 10, 64)
306
342
  return &fbcrawl.FacebookImage{
@@ -311,10 +347,6 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
311
347
  if result.PhotoId > 0 {
312
348
  post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
313
349
  }
314
-
315
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
316
- return selection.Text()
317
- }), "\n"))
318
350
  }
319
351
 
320
352
  //Comment
@@ -375,11 +407,18 @@ func getImageIdFromHref(href string) int64 {
375
407
  return i
376
408
  }
377
409
 
378
- func getReactionFromText(text string) int64 {
410
+ func getNumberFromText(text string) int64 {
379
411
  logger.Error("reaction", text)
380
412
  if len(text) > 0 {
381
- id, _ := strconv.ParseInt(regexp.MustCompile("(\\d+)").FindStringSubmatch(text)[1], 10, 64)
382
- return id
413
+ match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
414
+ count, _ := strconv.ParseInt(match[1], 10, 64)
415
+ switch match[2] {
416
+ case "k":
417
+ count *= 1000
418
+ case "m":
419
+ count *= 1000000
420
+ }
421
+ return count
383
422
  }
384
423
  return 0
385
424
  }
@@ -25,6 +25,7 @@ message FacebookPost {
25
25
  repeated FacebookComment comments = 5;
26
26
  int64 created_at = 9;
27
27
  int64 reaction_count = 10;
28
+ int64 comment_count = 11;
28
29
  }
29
30
 
30
31
  message FacebookImage {
@@ -1,3 +1,3 @@
1
1
  module FbcrawlColly
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fbcrawl-colly
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duy Le
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-13 00:00:00.000000000 Z
11
+ date: 2020-08-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi