fbcrawl-colly 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/fbcolly/fbcolly.go +56 -17
- data/fbcrawl.proto +1 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e43a203c1c38131f0d9eb32b67a1859bd1530aac9249751ff0329a5901388f6
|
4
|
+
data.tar.gz: f3520762f00d2b1a2d475a9b97d2141fe8cd98faf718ccf3d8a797e61489f976
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2560aa6a3239671a14f82a95034bbad03684dd44b0fc32cb2092b404678b2ec0e124a33db64a92f5c22f4cd3d9d94a42ccc40a6ab2e1e362088d0f02f05d22f3
|
7
|
+
data.tar.gz: 305f3d463d75bc3edcfcd1b5a985d9a9882e990e850786132eb1b6d4e93f4e3e31a2c0e9aeb23141cc0633cd1d5ad8e1d88ec4c0e3e4d2d201a6298a7c8699c6
|
data/Gemfile.lock
CHANGED
data/fbcolly/fbcolly.go
CHANGED
@@ -201,16 +201,52 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
|
|
201
201
|
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
202
202
|
}
|
203
203
|
})
|
204
|
+
collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
|
205
|
+
dataElement := element
|
206
|
+
post := &fbcrawl.FacebookPost{}
|
207
|
+
var fbDataFt FbDataFt
|
208
|
+
jsonData := dataElement.Attr("data-ft")
|
209
|
+
|
210
|
+
logger.Info(jsonData)
|
211
|
+
err = json.Unmarshal([]byte(jsonData), &fbDataFt)
|
212
|
+
if err != nil {
|
213
|
+
logger.Error(err)
|
214
|
+
return
|
215
|
+
}
|
216
|
+
logger.Info("Post ", fbDataFt)
|
217
|
+
post.Id = fbDataFt.TopLevelPostId
|
218
|
+
post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
219
|
+
post.User = &fbcrawl.FacebookUser{
|
220
|
+
Id: fbDataFt.ContentOwnerIdNew,
|
221
|
+
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
222
|
+
}
|
223
|
+
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
224
|
+
//Content
|
204
225
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
226
|
+
//NO BACKGROUND TEXT ONLY
|
227
|
+
post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
228
|
+
return selection.Text()
|
229
|
+
}), "\n")
|
230
|
+
|
231
|
+
if len(post.Content) == 0 {
|
232
|
+
// TEXT WITH BACKGROUND
|
233
|
+
post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
|
234
|
+
}
|
235
|
+
|
236
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
237
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
238
|
+
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
239
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
240
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
241
|
+
return &fbcrawl.FacebookImage{
|
242
|
+
Id: i,
|
243
|
+
}
|
244
|
+
})).([]*fbcrawl.FacebookImage)
|
245
|
+
|
246
|
+
if fbDataFt.PhotoId > 0 {
|
247
|
+
post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
|
248
|
+
}
|
249
|
+
result = append(result, post)
|
214
250
|
})
|
215
251
|
|
216
252
|
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
@@ -300,7 +336,7 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
300
336
|
}
|
301
337
|
|
302
338
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
-
post.ReactionCount =
|
339
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
304
340
|
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
305
341
|
i, _ := strconv.ParseInt(id, 10, 64)
|
306
342
|
return &fbcrawl.FacebookImage{
|
@@ -311,10 +347,6 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
311
347
|
if result.PhotoId > 0 {
|
312
348
|
post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
|
313
349
|
}
|
314
|
-
|
315
|
-
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
316
|
-
return selection.Text()
|
317
|
-
}), "\n"))
|
318
350
|
}
|
319
351
|
|
320
352
|
//Comment
|
@@ -375,11 +407,18 @@ func getImageIdFromHref(href string) int64 {
|
|
375
407
|
return i
|
376
408
|
}
|
377
409
|
|
378
|
-
func
|
410
|
+
func getNumberFromText(text string) int64 {
|
379
411
|
logger.Error("reaction", text)
|
380
412
|
if len(text) > 0 {
|
381
|
-
|
382
|
-
|
413
|
+
match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
|
414
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
415
|
+
switch match[2] {
|
416
|
+
case "k":
|
417
|
+
count *= 1000
|
418
|
+
case "m":
|
419
|
+
count *= 1000000
|
420
|
+
}
|
421
|
+
return count
|
383
422
|
}
|
384
423
|
return 0
|
385
424
|
}
|
data/fbcrawl.proto
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fbcrawl-colly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duy Le
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|