fbcrawl-colly 0.2.4 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/Dockerfile +14 -0
- data/Gemfile.lock +7 -6
- data/README.md +1 -1
- data/fbcrawl-colly.gemspec +2 -6
- data/fbcrawl.proto +84 -4
- data/{fbcolly → fbcrawl}/fbcolly.go +186 -110
- data/fbcrawl/pb/fbcrawl.pb.go +1793 -0
- data/fbcrawl/pb/fbcrawl_grpc.pb.go +344 -0
- data/go.mod +4 -11
- data/go.sum +17 -10
- data/lib/fbcrawl-colly.rb +1 -4
- data/lib/fbcrawl_colly.rb +5 -0
- data/lib/fbcrawl_colly/client.rb +54 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- data/lib/pb/fbcrawl_pb.rb +127 -0
- data/lib/pb/fbcrawl_services_pb.rb +30 -0
- data/main.go +71 -80
- metadata +14 -26
- data/ext/fbcrawl_colly/.gitignore +0 -2
- data/ext/fbcrawl_colly/Makefile +0 -6
- data/ext/fbcrawl_colly/extconf.rb +0 -6
- data/lib/fbcrawl_colly/colly.rb +0 -57
- data/lib/fbcrawl_colly/ffi.rb +0 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f420f1cc6b4b6260b8106422e80c0488a351fb21fdcaa0ad1af56b0402beb8da
|
4
|
+
data.tar.gz: 86193bbf6bfab5b48f73ad4f2594f5e4ff0761901ec5653af90ea638cbf9def6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6396d3572f0beaedaf198388e688e4f3bed09e6920c260cfddf61c36267c97e2a6babea3d297e07d1b7a816f6b416ec7c6d1a27775d85bea23dc4b5b07d9a51
|
7
|
+
data.tar.gz: a94d9ed5fc604e6e001abfb7f93ac9a82a075b686ada0214e854761ea0aa84712651b6605a7f885dcadc45be1dcad7d8bf2ac580f0e61dd2d3b086b02849c440
|
data/.gitignore
CHANGED
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
FROM golang:1.14-alpine
|
2
|
+
RUN apk add --no-cache git build-base tzdata
|
3
|
+
|
4
|
+
RUN mkdir -p /app
|
5
|
+
WORKDIR /app
|
6
|
+
ADD ./go.mod /app
|
7
|
+
ADD ./go.sum /app
|
8
|
+
ADD ./ /app
|
9
|
+
RUN go get
|
10
|
+
|
11
|
+
|
12
|
+
ENV PORT 3000
|
13
|
+
RUN go build -o server qnetwork.net/fbcrawl
|
14
|
+
ENTRYPOINT ["./server"]
|
data/Gemfile.lock
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fbcrawl-colly (
|
5
|
-
ffi
|
4
|
+
fbcrawl-colly (1.1.0)
|
6
5
|
google-protobuf
|
6
|
+
grpc
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
ffi (1.13.1)
|
12
11
|
google-protobuf (3.13.0)
|
12
|
+
googleapis-common-protos-types (1.0.5)
|
13
|
+
google-protobuf (~> 3.11)
|
14
|
+
grpc (1.31.1)
|
15
|
+
google-protobuf (~> 3.12)
|
16
|
+
googleapis-common-protos-types (~> 1.0)
|
13
17
|
minitest (5.14.1)
|
14
18
|
rake (12.3.3)
|
15
|
-
rake-compiler (1.1.1)
|
16
|
-
rake
|
17
19
|
|
18
20
|
PLATFORMS
|
19
21
|
ruby
|
@@ -22,7 +24,6 @@ DEPENDENCIES
|
|
22
24
|
fbcrawl-colly!
|
23
25
|
minitest (~> 5.0)
|
24
26
|
rake (~> 12.0)
|
25
|
-
rake-compiler
|
26
27
|
|
27
28
|
BUNDLED WITH
|
28
29
|
2.1.4
|
data/README.md
CHANGED
data/fbcrawl-colly.gemspec
CHANGED
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
end
|
26
26
|
# spec.bindir = "exe"
|
27
27
|
# spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
|
-
spec.
|
29
|
-
'ext/fbcrawl_colly/extconf.rb'
|
30
|
-
]
|
31
|
-
spec.require_paths = ["lib"]
|
28
|
+
spec.require_paths = %w[lib lib/pb]
|
32
29
|
|
33
|
-
spec.add_runtime_dependency 'ffi'
|
34
30
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
-
spec.
|
31
|
+
spec.add_runtime_dependency 'grpc'
|
36
32
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,9 +1,80 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
3
|
package fbcrawl_colly;
|
4
|
-
option go_package = "./fbcrawl;
|
4
|
+
option go_package = "./fbcrawl/pb;pb";
|
5
|
+
|
6
|
+
service Grpc {
|
7
|
+
// Sends a greeting
|
8
|
+
rpc Login (LoginRequest) returns (LoginResponse) {}
|
9
|
+
rpc FetchMyGroups (FetchMyGroupsRequest) returns (FacebookGroupList) {}
|
10
|
+
rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
|
11
|
+
rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
|
12
|
+
rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
|
13
|
+
rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
|
14
|
+
rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
|
15
|
+
rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
|
16
|
+
}
|
17
|
+
|
18
|
+
message Context {
|
19
|
+
string cookies = 1;
|
20
|
+
}
|
21
|
+
|
22
|
+
message LoginRequest {
|
23
|
+
string email = 2;
|
24
|
+
string password = 3;
|
25
|
+
string totp_secret = 4;
|
26
|
+
}
|
27
|
+
|
28
|
+
message LoginResponse {
|
29
|
+
string cookies = 1;
|
30
|
+
}
|
31
|
+
|
32
|
+
message LoginWithCookiesRequest {
|
33
|
+
string cookies = 1;
|
34
|
+
}
|
35
|
+
|
36
|
+
message FetchMyGroupsRequest {
|
37
|
+
Context context = 1;
|
38
|
+
}
|
39
|
+
|
40
|
+
message FetchGroupInfoRequest {
|
41
|
+
Context context = 1;
|
42
|
+
string group_username = 2;
|
43
|
+
}
|
44
|
+
|
45
|
+
message FetchUserInfoRequest {
|
46
|
+
Context context = 1;
|
47
|
+
string username = 2;
|
48
|
+
}
|
49
|
+
|
50
|
+
message FetchGroupFeedRequest {
|
51
|
+
Context context = 1;
|
52
|
+
int64 group_id = 2;
|
53
|
+
string next_cursor = 3;
|
54
|
+
}
|
55
|
+
|
56
|
+
message FetchPostRequest {
|
57
|
+
Context context = 1;
|
58
|
+
int64 group_id = 2;
|
59
|
+
int64 post_id = 3;
|
60
|
+
string comment_next_cursor = 4;
|
61
|
+
}
|
62
|
+
|
63
|
+
message FetchContentImagesRequest {
|
64
|
+
Context context = 1;
|
65
|
+
int64 post_id = 2;
|
66
|
+
string next_cursor = 3;
|
67
|
+
}
|
68
|
+
|
69
|
+
message FetchImageUrlRequest {
|
70
|
+
Context context = 1;
|
71
|
+
int64 image_id = 2;
|
72
|
+
}
|
73
|
+
|
74
|
+
message FacebookGroupList {
|
75
|
+
repeated FacebookGroup groups = 1;
|
76
|
+
}
|
5
77
|
|
6
|
-
// The request message containing the user's name.
|
7
78
|
message FacebookGroup {
|
8
79
|
int64 id = 1;
|
9
80
|
string name = 2;
|
@@ -13,6 +84,8 @@ message FacebookGroup {
|
|
13
84
|
message FacebookUser {
|
14
85
|
int64 id = 1;
|
15
86
|
string name = 2;
|
87
|
+
string username = 3;
|
88
|
+
int64 friend_count = 4;
|
16
89
|
}
|
17
90
|
|
18
91
|
message FacebookPost {
|
@@ -20,15 +93,20 @@ message FacebookPost {
|
|
20
93
|
FacebookGroup group = 2;
|
21
94
|
FacebookUser user = 3;
|
22
95
|
string content = 4;
|
96
|
+
CommentList comments = 5;
|
23
97
|
string content_link = 6;
|
24
|
-
FacebookImage content_image = 8;
|
25
98
|
repeated FacebookImage content_images = 7;
|
26
|
-
|
99
|
+
FacebookImage content_image = 8;
|
27
100
|
int64 created_at = 9;
|
28
101
|
int64 reaction_count = 10;
|
29
102
|
int64 comment_count = 11;
|
30
103
|
}
|
31
104
|
|
105
|
+
message CommentList {
|
106
|
+
repeated FacebookComment comments = 5;
|
107
|
+
string next_cursor = 12;
|
108
|
+
}
|
109
|
+
|
32
110
|
message FacebookImage {
|
33
111
|
int64 id = 1;
|
34
112
|
string url = 2;
|
@@ -44,8 +122,10 @@ message FacebookComment {
|
|
44
122
|
|
45
123
|
message FacebookPostList {
|
46
124
|
repeated FacebookPost posts = 1;
|
125
|
+
string next_cursor = 2;
|
47
126
|
}
|
48
127
|
|
49
128
|
message FacebookImageList {
|
50
129
|
repeated FacebookImage images = 1;
|
130
|
+
string next_cursor = 2;
|
51
131
|
}
|
@@ -6,16 +6,17 @@ import (
|
|
6
6
|
"errors"
|
7
7
|
"fmt"
|
8
8
|
"github.com/PuerkitoBio/goquery"
|
9
|
-
"github.com/gocolly/colly"
|
10
|
-
"github.com/gocolly/colly/extensions"
|
11
|
-
"github.com/gocolly/colly/storage"
|
9
|
+
"github.com/gocolly/colly/v2"
|
10
|
+
"github.com/gocolly/colly/v2/extensions"
|
11
|
+
"github.com/gocolly/colly/v2/storage"
|
12
12
|
"github.com/google/logger"
|
13
13
|
"github.com/olebedev/when"
|
14
14
|
"github.com/olebedev/when/rules/common"
|
15
15
|
"github.com/olebedev/when/rules/en"
|
16
16
|
"github.com/thoas/go-funk"
|
17
|
+
"github.com/xlzd/gotp"
|
17
18
|
"net/url"
|
18
|
-
"qnetwork.net/fbcrawl/fbcrawl"
|
19
|
+
"qnetwork.net/fbcrawl/fbcrawl/pb"
|
19
20
|
"regexp"
|
20
21
|
"strconv"
|
21
22
|
"strings"
|
@@ -33,7 +34,7 @@ type FbDataInsight struct {
|
|
33
34
|
FbDataPostContext `json:"post_context"`
|
34
35
|
}
|
35
36
|
type FbDataFt struct {
|
36
|
-
ContentOwnerIdNew
|
37
|
+
ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
|
37
38
|
PhotoAttachmentsList []string `json:"photo_attachments_list"`
|
38
39
|
PhotoId int64 `json:"photo_id,string"`
|
39
40
|
PageId int64 `json:"page_id,string"`
|
@@ -41,30 +42,38 @@ type FbDataFt struct {
|
|
41
42
|
PageInsights map[string]FbDataInsight `json:"page_insights"`
|
42
43
|
}
|
43
44
|
|
44
|
-
func sharedOnRequest(request *colly.Request) {
|
45
|
-
logger.Info("OnRequest")
|
46
|
-
//request.Headers.Set("Host", "facebook.com")
|
47
|
-
request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
48
|
-
request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
49
|
-
request.Headers.Set("origin", "https://mbasic.facebook.com")
|
50
|
-
|
51
|
-
//logger.Info("Saved referrer is", request.Ctx.Get("_referer"))
|
52
|
-
request.Headers.Set("referer", "https://mbasic.facebook.com/checkpoint/?_rdr")
|
53
|
-
request.Headers.Set("cache-control", "max-age=0")
|
54
|
-
request.Headers.Set("upgrade-insecure-requests", "1")
|
55
|
-
//accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
|
56
|
-
//origin: https://mbasic.facebook.com
|
57
|
-
//referer: https://mbasic.facebook.com/checkpoint/?_rdr
|
58
|
-
request.Headers.Set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36")
|
59
|
-
request.ResponseCharacterEncoding = "utf-8"
|
60
|
-
}
|
61
|
-
|
62
45
|
func setupSharedCollector(collector *colly.Collector) error {
|
63
46
|
var err error
|
64
47
|
extensions.Referer(collector)
|
65
48
|
collector.AllowURLRevisit = true
|
66
|
-
|
67
|
-
collector.
|
49
|
+
var lastUrl string
|
50
|
+
collector.OnRequest(func(request *colly.Request) {
|
51
|
+
lastUrl = request.URL.RawPath
|
52
|
+
logger.Info("OnRequest ", request.URL)
|
53
|
+
//request.Headers.Set("Host", "facebook.com")
|
54
|
+
request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
55
|
+
request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
56
|
+
request.Headers.Set("origin", "https://mbasic.facebook.com")
|
57
|
+
|
58
|
+
//logger.Info("Saved referrer is", request.Ctx.Get("_referer"))
|
59
|
+
request.Headers.Set("referer", "https://mbasic.facebook.com/checkpoint/?_rdr")
|
60
|
+
request.Headers.Set("cache-control", "max-age=0")
|
61
|
+
request.Headers.Set("upgrade-insecure-requests", "1")
|
62
|
+
//accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
|
63
|
+
//origin: https://mbasic.facebook.com
|
64
|
+
//referer: https://mbasic.facebook.com/checkpoint/?_rdr
|
65
|
+
request.Headers.Set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36")
|
66
|
+
request.ResponseCharacterEncoding = "utf-8"
|
67
|
+
})
|
68
|
+
collector.OnResponse(func(response *colly.Response) {
|
69
|
+
logger.Info("OnResponse ./last.html")
|
70
|
+
_ = response.Save("./last.html")
|
71
|
+
//logger.Info(string(resp.Body))
|
72
|
+
})
|
73
|
+
|
74
|
+
collector.OnHTML("a[href*=\"177066345680802\"", func(element *colly.HTMLElement) {
|
75
|
+
logger.Error("RateLimit reached ", lastUrl)
|
76
|
+
})
|
68
77
|
collector.OnError(func(resp *colly.Response, errHttp error) {
|
69
78
|
err = errHttp
|
70
79
|
logger.Error("OnError", err)
|
@@ -72,12 +81,6 @@ func setupSharedCollector(collector *colly.Collector) error {
|
|
72
81
|
return err
|
73
82
|
}
|
74
83
|
|
75
|
-
func sharedOnResponse(response *colly.Response) {
|
76
|
-
logger.Info("OnResponse ./last.html")
|
77
|
-
_ = response.Save("./last.html")
|
78
|
-
//logger.Info(string(resp.Body))
|
79
|
-
}
|
80
|
-
|
81
84
|
func getForm(element *colly.HTMLElement, err error) (string, error, map[string]string) {
|
82
85
|
submitUrl, exists := element.DOM.Attr("action")
|
83
86
|
if !exists {
|
@@ -106,7 +109,7 @@ func New() *Fbcolly {
|
|
106
109
|
return &f
|
107
110
|
}
|
108
111
|
|
109
|
-
func (f *Fbcolly) Login(email string, password string,
|
112
|
+
func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
|
110
113
|
collector := f.collector.Clone()
|
111
114
|
err := setupSharedCollector(collector)
|
112
115
|
|
@@ -155,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
155
158
|
//logger.Info("Please input OTP")
|
156
159
|
//reader := bufio.NewReader(os.Stdin)
|
157
160
|
//code, _ := reader.ReadString('\n')
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
+
if len(totpSecret) > 0 {
|
162
|
+
code := gotp.NewDefaultTOTP(totpSecret).Now()
|
163
|
+
reqMap["approvals_code"] = code
|
164
|
+
shouldSubmit = true
|
165
|
+
}
|
166
|
+
|
161
167
|
} else {
|
162
168
|
logger.Info("OnHTML Only Continue checkpoint")
|
163
169
|
|
@@ -191,22 +197,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
191
197
|
|
192
198
|
}
|
193
199
|
|
194
|
-
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *
|
200
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
|
195
201
|
collector := f.collector.Clone()
|
196
202
|
err := setupSharedCollector(collector)
|
197
|
-
|
198
|
-
var result []*fbcrawl.FacebookPost
|
203
|
+
result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
|
199
204
|
|
200
205
|
collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
|
201
|
-
|
202
|
-
if currentPage < 3 {
|
203
|
-
logger.Info("Will fetch page", currentPage)
|
204
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
205
|
-
}
|
206
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
206
207
|
})
|
207
|
-
collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
|
208
|
+
collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
|
208
209
|
dataElement := element
|
209
|
-
post := &
|
210
|
+
post := &pb.FacebookPost{}
|
210
211
|
var fbDataFt FbDataFt
|
211
212
|
jsonData := dataElement.Attr("data-ft")
|
212
213
|
|
@@ -218,9 +219,10 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
|
|
218
219
|
}
|
219
220
|
logger.Info("Post ", fbDataFt)
|
220
221
|
post.Id = fbDataFt.TopLevelPostId
|
221
|
-
post.Group = &
|
222
|
-
|
223
|
-
|
222
|
+
post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
223
|
+
userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
|
224
|
+
post.User = &pb.FacebookUser{
|
225
|
+
Id: userId,
|
224
226
|
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
225
227
|
}
|
226
228
|
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
@@ -239,77 +241,110 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
|
|
239
241
|
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
240
242
|
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
241
243
|
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
242
|
-
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *
|
244
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
243
245
|
i, _ := strconv.ParseInt(id, 10, 64)
|
244
|
-
return &
|
246
|
+
return &pb.FacebookImage{
|
245
247
|
Id: i,
|
246
248
|
}
|
247
|
-
})).([]*
|
249
|
+
})).([]*pb.FacebookImage)
|
248
250
|
|
249
251
|
if fbDataFt.PhotoId > 0 {
|
250
|
-
post.ContentImage = &
|
252
|
+
post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
|
251
253
|
}
|
252
|
-
result = append(result, post)
|
254
|
+
result.Posts = append(result.Posts, post)
|
255
|
+
})
|
256
|
+
if len(nextCursor) > 0 {
|
257
|
+
err = collector.Visit(nextCursor)
|
258
|
+
} else {
|
259
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
260
|
+
}
|
261
|
+
|
262
|
+
if err != nil {
|
263
|
+
logger.Error("crawl by colly err:", err)
|
264
|
+
}
|
265
|
+
return err, &result
|
266
|
+
}
|
267
|
+
|
268
|
+
func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
|
269
|
+
collector := f.collector.Clone()
|
270
|
+
err := setupSharedCollector(collector)
|
271
|
+
|
272
|
+
result := &pb.FacebookUser{}
|
273
|
+
|
274
|
+
collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
|
275
|
+
parsed, _ := url.Parse(element.Attr("href"))
|
276
|
+
result.Username = strings.Split(parsed.Path[1:], "/")[0]
|
277
|
+
result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
|
278
|
+
})
|
279
|
+
|
280
|
+
collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
|
281
|
+
result.FriendCount = getNumberFromText(element.Text)
|
282
|
+
})
|
283
|
+
|
284
|
+
collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
|
285
|
+
result.Name = element.DOM.Find("strong").First().Text()
|
253
286
|
})
|
254
287
|
|
255
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com
|
288
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
|
256
289
|
if err != nil {
|
257
290
|
logger.Error("crawl by colly err:", err)
|
258
291
|
}
|
259
|
-
return err,
|
292
|
+
return err, result
|
260
293
|
}
|
261
294
|
|
262
|
-
func (f *Fbcolly) FetchGroupInfo(
|
295
|
+
func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
|
263
296
|
collector := f.collector.Clone()
|
264
297
|
err := setupSharedCollector(collector)
|
265
|
-
result :=
|
298
|
+
result := &pb.FacebookGroup{}
|
266
299
|
|
267
300
|
collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
|
268
301
|
result.Name = element.Text
|
269
302
|
})
|
270
303
|
collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
|
271
|
-
result.
|
304
|
+
result.Id = getNumberFromText(element.Attr("href"))
|
305
|
+
result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
|
272
306
|
})
|
273
307
|
|
274
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%
|
308
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
|
275
309
|
if err != nil {
|
276
310
|
logger.Error("crawl by colly err:", err)
|
277
311
|
}
|
278
312
|
return err, result
|
279
313
|
}
|
280
314
|
|
281
|
-
func (f *Fbcolly) FetchContentImages(postId int64) (error, *
|
315
|
+
func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
|
282
316
|
collector := f.collector.Clone()
|
283
317
|
err := setupSharedCollector(collector)
|
284
|
-
|
285
|
-
var result []*fbcrawl.FacebookImage
|
318
|
+
result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
|
286
319
|
|
287
320
|
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
288
|
-
|
289
|
-
logger.Info("Will fetch page", currentPage)
|
290
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
321
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
291
322
|
})
|
292
323
|
|
293
324
|
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
294
|
-
result = append(result, &
|
325
|
+
result.Images = append(result.Images, &pb.FacebookImage{
|
295
326
|
Id: getImageIdFromHref(element.Attr("href")),
|
296
327
|
})
|
297
328
|
//f.detailCollector.Visit(url)
|
298
329
|
})
|
330
|
+
if len(nextCursor) > 0 {
|
331
|
+
err = collector.Visit(nextCursor)
|
332
|
+
} else {
|
333
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
334
|
+
}
|
299
335
|
|
300
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
301
336
|
if err != nil {
|
302
337
|
logger.Error("crawl by colly err:", err)
|
303
338
|
}
|
304
|
-
return err, &
|
339
|
+
return err, &result
|
305
340
|
}
|
306
341
|
|
307
|
-
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *
|
342
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
|
308
343
|
collector := f.collector.Clone()
|
309
344
|
err := setupSharedCollector(collector)
|
310
|
-
result :=
|
345
|
+
result := pb.FacebookImage{Id: imageId}
|
311
346
|
|
312
|
-
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
347
|
+
collector.OnHTML("a[href*=\"fbcdn\"]", func(element *colly.HTMLElement) {
|
313
348
|
result.Url = element.Attr("href")
|
314
349
|
})
|
315
350
|
|
@@ -320,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
|
320
355
|
return err, &result
|
321
356
|
}
|
322
357
|
|
323
|
-
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *
|
358
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
|
324
359
|
collector := f.collector.Clone()
|
325
360
|
err := setupSharedCollector(collector)
|
326
|
-
post := &
|
327
|
-
commentPaging := 0
|
361
|
+
post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
|
328
362
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
329
363
|
dataElement := element.DOM.Find("div[data-ft]")
|
330
364
|
if dataElement.Length() > 0 {
|
@@ -339,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
339
373
|
}
|
340
374
|
logger.Info("Post ", result)
|
341
375
|
post.Id = result.TopLevelPostId
|
342
|
-
post.Group = &
|
343
|
-
|
344
|
-
|
376
|
+
post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
377
|
+
userId, _ := result.ContentOwnerIdNew.Int64()
|
378
|
+
post.User = &pb.FacebookUser{
|
379
|
+
Id: userId,
|
345
380
|
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
346
381
|
}
|
347
382
|
post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
|
@@ -359,15 +394,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
359
394
|
|
360
395
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
361
396
|
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
362
|
-
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *
|
397
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
363
398
|
i, _ := strconv.ParseInt(id, 10, 64)
|
364
|
-
return &
|
399
|
+
return &pb.FacebookImage{
|
365
400
|
Id: i,
|
366
401
|
}
|
367
|
-
})).([]*
|
402
|
+
})).([]*pb.FacebookImage)
|
368
403
|
|
369
404
|
if result.PhotoId > 0 {
|
370
|
-
post.ContentImage = &
|
405
|
+
post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
|
371
406
|
}
|
372
407
|
}
|
373
408
|
|
@@ -375,32 +410,48 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
375
410
|
element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
|
376
411
|
//author
|
377
412
|
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
413
|
+
if commentId > 0 {
|
414
|
+
createdAtWhenResult, err := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
415
|
+
if err != nil {
|
416
|
+
logger.Error(err)
|
417
|
+
return
|
418
|
+
}
|
419
|
+
parsed, err := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
|
420
|
+
if err != nil {
|
421
|
+
logger.Error(err)
|
422
|
+
return
|
423
|
+
}
|
424
|
+
if len(parsed.Path) == 0 {
|
425
|
+
logger.Error("Empty path for commentId ", commentId)
|
426
|
+
return
|
427
|
+
}
|
428
|
+
if len(parsed.Path) > 1 {
|
429
|
+
post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
|
430
|
+
Id: commentId,
|
431
|
+
Post: &pb.FacebookPost{Id: post.Id},
|
432
|
+
User: &pb.FacebookUser{
|
433
|
+
Username: parsed.Path[1:],
|
434
|
+
Name: selection.Find("h3 > a").Text(),
|
435
|
+
},
|
436
|
+
Content: selection.Find("h3 + div").Text(),
|
437
|
+
CreatedAt: createdAtWhenResult.Time.Unix(),
|
438
|
+
})
|
439
|
+
}
|
440
|
+
}
|
390
441
|
})
|
391
442
|
|
392
443
|
}
|
393
444
|
})
|
394
445
|
|
395
446
|
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
396
|
-
|
397
|
-
logger.Info("Comment paging", commentPaging)
|
398
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
399
|
-
commentPaging = commentPaging + 1
|
400
|
-
}
|
447
|
+
post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
401
448
|
})
|
449
|
+
if len(commentNextCursor) > 0 {
|
450
|
+
err = collector.Visit(commentNextCursor)
|
451
|
+
} else {
|
452
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
453
|
+
}
|
402
454
|
|
403
|
-
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
404
455
|
return err, post
|
405
456
|
}
|
406
457
|
|
@@ -409,13 +460,36 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
|
409
460
|
return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
|
410
461
|
}
|
411
462
|
|
463
|
+
func (f *Fbcolly) FetchMyGroups() (error, *pb.FacebookGroupList) {
|
464
|
+
collector := f.collector.Clone()
|
465
|
+
err := setupSharedCollector(collector)
|
466
|
+
result := &pb.FacebookGroupList{Groups: []*pb.FacebookGroup{}}
|
467
|
+
|
468
|
+
collector.OnHTML("li table a", func(element *colly.HTMLElement) {
|
469
|
+
result.Groups = append(result.Groups, &pb.FacebookGroup{
|
470
|
+
Id: getNumberFromText(element.Attr("href")),
|
471
|
+
Name: element.Text,
|
472
|
+
})
|
473
|
+
})
|
474
|
+
|
475
|
+
err = collector.Visit("https://mbasic.facebook.com/groups/?seemore")
|
476
|
+
if err != nil {
|
477
|
+
logger.Error("crawl by colly err:", err)
|
478
|
+
}
|
479
|
+
return err, result
|
480
|
+
}
|
481
|
+
|
412
482
|
//func getUsernameFromHref(href string) string {
|
413
483
|
// return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
|
414
484
|
//}
|
415
485
|
|
416
486
|
func getUserIdFromCommentHref(href string) int64 {
|
417
|
-
|
418
|
-
|
487
|
+
match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
|
488
|
+
if len(match) > 0 {
|
489
|
+
id, _ := strconv.ParseInt(match[1], 10, 64)
|
490
|
+
return id
|
491
|
+
}
|
492
|
+
return 0
|
419
493
|
}
|
420
494
|
|
421
495
|
func getUrlFromRedirectHref(href string) string {
|
@@ -430,17 +504,19 @@ func getImageIdFromHref(href string) int64 {
|
|
430
504
|
}
|
431
505
|
|
432
506
|
func getNumberFromText(text string) int64 {
|
433
|
-
logger.
|
507
|
+
logger.Info("getNumberFromText ", text)
|
434
508
|
if len(text) > 0 {
|
435
|
-
match := regexp.MustCompile("(\\d
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
509
|
+
match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
|
510
|
+
if len(match) > 0 {
|
511
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
512
|
+
switch match[2] {
|
513
|
+
case "k":
|
514
|
+
count *= 1000
|
515
|
+
case "m":
|
516
|
+
count *= 1000000
|
517
|
+
}
|
518
|
+
return count
|
442
519
|
}
|
443
|
-
return count
|
444
520
|
}
|
445
521
|
return 0
|
446
522
|
}
|