fbcrawl-colly 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96d233862f21d1dff4a447d57f0f4a4ee6311506d7b5cf1990d75bf92043defa
4
- data.tar.gz: a238ee4a34eac5f33fa90bba0d5bdb22169bd0b54034682f8fbb25c1ca4de039
3
+ metadata.gz: 22ec88aaf52a44344fc32ab22f6914f264b49ea10912c527c030ffd97ac12d36
4
+ data.tar.gz: d007e4326b15c64e725009548c8e9dac00e263da888d736876b6093a4fba5108
5
5
  SHA512:
6
- metadata.gz: 36d58d4ef38bf94164f94fe0a841171cc5bdd12a4f2503e818858d77cae029e1fc79cddd3213b2bb9e130e70833703df97ad18700d92c2cc3e31401b8e1f1443
7
- data.tar.gz: 5c56a623910f87369812c0d74908e618490b4aeead9d22b055f8076ca040ba9fe5fff0bb380c44e7e435e0d83a13823b259b0f6c57233d7fb3ec38ff83de0832
6
+ metadata.gz: 448bbb6b045d3b4baa3c0eae23bad85f0daf93159f514ff77e0f9383350b1c52aa3ab5360646099e56ccc19eea0bc8300ba42e67a26262e1e1f3141be61515c0
7
+ data.tar.gz: 5d326d26fb0354ea78236849be7328ce49f919435c7c373130f19439c2447f9026ce31f88bc64d5cc72581fadc618a0891cd777f885e438f01217295b58309e5
data/.gitignore CHANGED
@@ -14,3 +14,6 @@ last.html
14
14
  *.db
15
15
  /fbcrawl/fbcrawl.pb.go
16
16
  /lib/fbcrawl_pb.rb
17
+
18
+ mkmf.log
19
+ .rakeTasks
@@ -0,0 +1,28 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ fbcrawl-colly (0.1.1)
5
+ ffi
6
+ google-protobuf
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ffi (1.13.1)
12
+ google-protobuf (3.12.4-universal-darwin)
13
+ minitest (5.14.1)
14
+ rake (12.3.3)
15
+ rake-compiler (1.1.1)
16
+ rake
17
+
18
+ PLATFORMS
19
+ ruby
20
+
21
+ DEPENDENCIES
22
+ fbcrawl-colly!
23
+ minitest (~> 5.0)
24
+ rake (~> 12.0)
25
+ rake-compiler
26
+
27
+ BUNDLED WITH
28
+ 2.1.4
data/Rakefile CHANGED
@@ -7,4 +7,13 @@ Rake::TestTask.new(:test) do |t|
7
7
  t.test_files = FileList["test/**/*_test.rb"]
8
8
  end
9
9
 
10
# Builds the native extension from inside ext/fbcrawl_colly: loads that
# directory's extconf.rb and then runs `make` there.
task :fbcrawl_colly do
  Dir.chdir("./ext/fbcrawl_colly/") do
    require './extconf'
    # `sh` echoes the command and fails the task on a non-zero exit status;
    # backticks would discard both the output and the failure.
    sh 'make'
  end
end
16
+
17
+ task :compile => [:fbcrawl_colly]
18
+ task :test => :compile
10
19
  task :default => :test
@@ -1,5 +1,6 @@
1
1
  require 'mkmf'
2
- MakeMakefile::find_executable 'go'
3
- MakeMakefile::find_executable 'protoc'
4
- MakeMakefile::find_executable 'protoc-gen-go'
5
- $makefile_created = true
2
+ requirement_passed = true
3
+ requirement_passed &&= MakeMakefile::find_executable 'go'
4
+ requirement_passed &&= MakeMakefile::find_executable 'protoc'
5
+ requirement_passed &&= MakeMakefile::find_executable 'protoc-gen-go'
6
+ $makefile_created = requirement_passed
@@ -10,14 +10,35 @@ import (
10
10
  "github.com/gocolly/colly/extensions"
11
11
  "github.com/gocolly/colly/storage"
12
12
  "github.com/google/logger"
13
+ "github.com/olebedev/when"
14
+ "github.com/olebedev/when/rules/common"
15
+ "github.com/olebedev/when/rules/en"
16
+ "github.com/thoas/go-funk"
13
17
  "net/url"
14
18
  "qnetwork.net/fbcrawl/fbcrawl"
15
19
  "regexp"
20
+ "strconv"
16
21
  "strings"
22
+ "time"
17
23
  )
18
24
 
19
25
  type Fbcolly struct {
20
26
  collector *colly.Collector
27
+ w *when.Parser
28
+ }
29
+ type FbDataPostContext struct {
30
+ PublishTime int64 `json:"publish_time"`
31
+ }
32
+ type FbDataInsight struct {
33
+ FbDataPostContext `json:"post_context"`
34
+ }
35
+ type FbDataFt struct {
36
+ ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
+ PhotoId int64 `json:"photo_id,string"`
39
+ PageId int64 `json:"page_id,string"`
40
+ TopLevelPostId int64 `json:"top_level_post_id,string"`
41
+ PageInsights map[string]FbDataInsight `json:"page_insights"`
21
42
  }
22
43
 
23
44
  func sharedOnRequest(request *colly.Request) {
@@ -79,16 +100,19 @@ func getForm(element *colly.HTMLElement, err error) (string, error, map[string]s
79
100
  func New() *Fbcolly {
80
101
  f := Fbcolly{}
81
102
  f.collector = colly.NewCollector()
103
+ f.w = when.New(nil)
104
+ f.w.Add(en.All...)
105
+ f.w.Add(common.All...)
82
106
  return &f
83
107
  }
84
108
 
85
- func (f *Fbcolly) Login(email string, password string, otp string) error {
109
+ func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
86
110
  collector := f.collector.Clone()
87
- setupSharedCollector(collector)
111
+ err := setupSharedCollector(collector)
88
112
 
89
113
  logger.Info("Login using email", email)
114
+ loggedIn := false
90
115
 
91
- var err error
92
116
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
93
117
  logger.Info("OnHTML login_form")
94
118
  loginURL, err, reqMap := getForm(element, err)
@@ -106,12 +130,13 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
106
130
  })
107
131
 
108
132
  collector.OnHTML("a[href=\"/login/save-device/cancel/?flow=interstitial_nux&nux_source=regular_login\"]", func(element *colly.HTMLElement) {
109
- collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
133
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
110
134
  })
111
135
 
112
136
  collector.OnHTML("form[action=\"/login/checkpoint/\"]", func(element *colly.HTMLElement) {
113
137
 
114
138
  checkpointUrl, err, reqMap := getForm(element, err)
139
+ shouldSubmit := false
115
140
  if err != nil {
116
141
  logger.Error(err)
117
142
  return
@@ -121,6 +146,7 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
121
146
  //Save Device
122
147
  logger.Info("OnHTML Save Device checkpoint")
123
148
  reqMap["name_action_selected"] = "dont_save"
149
+ shouldSubmit = true
124
150
  } else if element.DOM.Find("input[name=\"approvals_code\"]").Length() > 0 {
125
151
  logger.Info("OnHTML OTP checkpoint")
126
152
  //logger.Info("Please input OTP")
@@ -128,11 +154,15 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
128
154
  //code, _ := reader.ReadString('\n')
129
155
  code := otp[0:6]
130
156
  reqMap["approvals_code"] = code
157
+ shouldSubmit = true
131
158
  } else {
132
159
  logger.Info("OnHTML Only Continue checkpoint")
160
+
161
+ }
162
+ if shouldSubmit {
163
+ logger.Info("req map:", reqMap)
164
+ err = collector.Post(checkpointUrl, reqMap)
133
165
  }
134
- logger.Info("req map:", reqMap)
135
- err = collector.Post(checkpointUrl, reqMap)
136
166
  if err != nil {
137
167
  logger.Error("post err:", err)
138
168
  }
@@ -141,19 +171,24 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
141
171
  collector.OnHTML("form[action=\"/search/\"]", func(element *colly.HTMLElement) {
142
172
  //We're in home
143
173
  logger.Info("I'm IN HOME, navigate to page now")
174
+ loggedIn = true
144
175
  })
145
176
 
146
177
  err = collector.Visit("https://mbasic.facebook.com/")
147
178
  if err != nil {
148
179
  logger.Error("crawl by colly err:", err)
149
180
  }
150
- logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
151
- //return err, storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/"))
152
- //return err, collector.getS.Cookies("https://mbasic.facebook.com/")
153
- return err
181
+
182
+ if loggedIn {
183
+ logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
184
+ return storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")), err
185
+ } else {
186
+ return "", err
187
+ }
188
+
154
189
  }
155
190
 
156
- func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostList) {
191
+ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
157
192
  collector := f.collector.Clone()
158
193
  err := setupSharedCollector(collector)
159
194
  currentPage := 1
@@ -163,72 +198,179 @@ func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostLi
163
198
  currentPage++
164
199
  if currentPage < 3 {
165
200
  logger.Info("Will fetch page", currentPage)
166
- collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
201
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
167
202
  }
168
203
  })
169
204
 
170
- //TODO: May not need this
171
205
  collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
+ logger.Info("Post found at", element.Attr("href"))
172
207
  u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
173
-
208
+ postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
174
209
  result = append(result, &fbcrawl.FacebookPost{
175
- Id: u.Query().Get("id"),
210
+ Id: postId,
176
211
  Group: &fbcrawl.FacebookGroup{Id: groupId},
177
212
  })
178
213
  //f.detailCollector.Visit(url)
179
214
  })
180
215
 
181
- err = collector.Visit("https://mbasic.facebook.com/groups/" + groupId)
216
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
182
217
  if err != nil {
183
218
  logger.Error("crawl by colly err:", err)
184
219
  }
185
220
  return err, &fbcrawl.FacebookPostList{Posts: result}
186
221
  }
187
222
 
188
- func (f *Fbcolly) FetchPost(groupId string, postId string) (error, *fbcrawl.FacebookPost) {
223
+ func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
224
+ collector := f.collector.Clone()
225
+ err := setupSharedCollector(collector)
226
+ currentPage := 1
227
+ var result []*fbcrawl.FacebookImage
228
+
229
+ collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
230
+ currentPage++
231
+ logger.Info("Will fetch page", currentPage)
232
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
233
+ })
234
+
235
+ collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
236
+ result = append(result, &fbcrawl.FacebookImage{
237
+ Id: getImageIdFromHref(element.Attr("href")),
238
+ })
239
+ //f.detailCollector.Visit(url)
240
+ })
241
+
242
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
243
+ if err != nil {
244
+ logger.Error("crawl by colly err:", err)
245
+ }
246
+ return err, &fbcrawl.FacebookImageList{Images: result}
247
+ }
248
+
249
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
189
250
  collector := f.collector.Clone()
190
251
  err := setupSharedCollector(collector)
191
- post := &fbcrawl.FacebookPost{}
252
+ result := fbcrawl.FacebookImage{Id: imageId}
253
+
254
+ collector.OnHTML("a", func(element *colly.HTMLElement) {
255
+ result.Url = element.Attr("href")
256
+ })
257
+
258
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/photo/view_full_size/?fbid=%d", imageId))
259
+ if err != nil {
260
+ logger.Error("crawl by colly err:", err)
261
+ }
262
+ return err, &result
263
+ }
264
+
265
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
266
+ collector := f.collector.Clone()
267
+ err := setupSharedCollector(collector)
268
+ post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
269
+ commentPaging := 0
192
270
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
193
271
  dataElement := element.DOM.Find("div[data-ft]")
194
272
  if dataElement.Length() > 0 {
195
- var result map[string]string
273
+ var result FbDataFt
196
274
  jsonData, isExist := dataElement.Attr("data-ft")
197
275
  if isExist {
198
- json.Unmarshal([]byte(jsonData), &result)
276
+ logger.Info(jsonData)
277
+ err = json.Unmarshal([]byte(jsonData), &result)
278
+ if err != nil {
279
+ logger.Error(err)
280
+ return
281
+ }
199
282
  logger.Info("Post ", result)
200
- post.Id = result["top_level_post_id"]
201
- post.Group = &fbcrawl.FacebookGroup{Id: result["page_id"], Name: dataElement.Find("h3 strong:last-child a").Text()}
202
- post.User = &fbcrawl.FacebookUser{Id: result["content_owner_id_new"], Name: dataElement.Find("h3 strong:first-child a").Text()}
283
+ post.Id = result.TopLevelPostId
284
+ post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
285
+ post.User = &fbcrawl.FacebookUser{
286
+ Id: result.ContentOwnerIdNew,
287
+ Name: dataElement.Find("h3 strong:first-child a").Text(),
288
+ }
289
+ post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
203
290
  //Content
291
+
292
+ //NO BACKGROUND TEXT ONLY
204
293
  post.Content = strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
205
294
  return selection.Text()
206
295
  }), "\n")
207
296
 
297
+ if len(post.Content) == 0 {
298
+ // TEXT WITH BACKGROUND
299
+ post.Content = dataElement.Find("div[style*=\"background-image:url\"]").Text()
300
+ }
301
+
302
+ post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
+
304
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
305
+ i, _ := strconv.ParseInt(id, 10, 64)
306
+ return &fbcrawl.FacebookImage{
307
+ Id: i,
308
+ }
309
+ })).([]*fbcrawl.FacebookImage)
310
+
311
+ if result.PhotoId > 0 {
312
+ post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
313
+ }
314
+
208
315
  logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
209
316
  return selection.Text()
210
317
  }), "\n"))
211
318
  }
212
- post.Comments = []*fbcrawl.FacebookComment{}
319
+
213
320
  //Comment
214
321
  element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
215
322
  //author
216
- commentId := selection.AttrOr("id", "")
323
+ commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
217
324
  logger.Info("comment", commentId)
218
- //idRegex, _ := regexp.Compile("")
219
- //idRegex.FindString()
325
+ createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
220
326
  post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
221
327
  Id: commentId,
222
328
  Post: &fbcrawl.FacebookPost{Id: post.Id},
223
329
  User: &fbcrawl.FacebookUser{
224
- Id: regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch(selection.Find("h3 > a").AttrOr("href", ""))[1],
330
+ Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
225
331
  Name: selection.Find("h3 > a").Text(),
226
332
  },
227
- Content: selection.Find("h3 + div").Text(),
333
+ Content: selection.Find("h3 + div").Text(),
334
+ CreatedAt: createdAtWhenResult.Time.Unix(),
228
335
  })
229
336
  })
337
+
338
+ }
339
+ })
340
+
341
+ collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
342
+ if commentPaging < 3 {
343
+ logger.Info("Comment paging", commentPaging)
344
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
345
+ commentPaging = commentPaging + 1
230
346
  }
231
347
  })
232
- collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%s?view=permalink&id=%s&_rdr", groupId, postId))
348
+
349
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
233
350
  return err, post
234
351
  }
352
+
353
+ func (f *Fbcolly) LoginWithCookies(cookies string) error {
354
+ collector := f.collector
355
+ return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
356
+ }
357
+
358
+ //func getUsernameFromHref(href string) string {
359
+ // return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
360
+ //}
361
+
362
// commentFormIDPattern captures the first run of digits that follows
// "#comment_form_" in a link.
var commentFormIDPattern = regexp.MustCompile(`#comment_form_(\d+)`)

// getUserIdFromCommentHref returns the number that directly follows
// "#comment_form_" in href, or 0 when href has no such fragment.
func getUserIdFromCommentHref(href string) int64 {
	match := commentFormIDPattern.FindStringSubmatch(href)
	if match == nil {
		// No match: indexing the nil submatch slice would panic.
		return 0
	}
	// The capture is digits only, so ParseInt can fail only on int64
	// overflow, in which case it returns the clamped value.
	id, _ := strconv.ParseInt(match[1], 10, 64)
	return id
}
366
+
367
// getUrlFromRedirectHref returns the decoded "u" query parameter of href,
// or "" when href cannot be parsed or has no such parameter.
func getUrlFromRedirectHref(href string) string {
	u, err := url.Parse(href)
	if err != nil {
		// url.Parse returns a nil URL on error; calling Query on it would panic.
		return ""
	}
	return u.Query().Get("u")
}
371
+
372
// getImageIdFromHref returns the "fbid" query parameter of href as an int64,
// or 0 when href cannot be parsed or the parameter is missing or not numeric.
func getImageIdFromHref(href string) int64 {
	u, err := url.Parse(href)
	if err != nil {
		// url.Parse returns a nil URL on error; calling Query on it would panic.
		return 0
	}
	// A missing or non-numeric fbid makes ParseInt return 0, which is the
	// value reported to the caller.
	id, _ := strconv.ParseInt(u.Query().Get("fbid"), 10, 64)
	return id
}
@@ -1,4 +1,4 @@
1
- require_relative 'lib/fbcrawl-colly/version'
1
+ require_relative 'lib/fbcrawl_colly/version'
2
2
 
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "fbcrawl-colly"
@@ -32,4 +32,5 @@ Gem::Specification.new do |spec|
32
32
 
33
33
  spec.add_runtime_dependency 'ffi'
34
34
  spec.add_runtime_dependency 'google-protobuf'
35
+ spec.add_development_dependency 'rake-compiler'
35
36
  end
@@ -1,33 +1,48 @@
1
1
  syntax = "proto3";
2
2
 
3
+ package fbcrawl_colly;
3
4
  option go_package = "./fbcrawl;fbcrawl";
4
5
 
5
6
  // FacebookGroup identifies a Facebook group by its numeric id and display name.
6
7
  message FacebookGroup {
7
- string id = 1;
8
+ int64 id = 1;
8
9
  string name = 2;
9
10
  }
10
11
 
11
12
  message FacebookUser {
12
- string id = 1;
13
+ int64 id = 1;
13
14
  string name = 2;
14
15
  }
15
16
 
16
17
  message FacebookPost {
17
- string id = 1;
18
+ int64 id = 1;
18
19
  FacebookGroup group = 2;
19
20
  FacebookUser user = 3;
20
21
  string content = 4;
22
+ string content_link = 6;
23
+ FacebookImage content_image = 8;
24
+ repeated FacebookImage content_images = 7;
21
25
  repeated FacebookComment comments = 5;
26
+ int64 created_at = 9;
27
+ }
28
+
29
+ message FacebookImage {
30
+ int64 id = 1;
31
+ string url = 2;
22
32
  }
23
33
 
24
34
  message FacebookComment {
25
- string id = 1;
35
+ int64 id = 1;
26
36
  FacebookPost post = 2;
27
37
  FacebookUser user = 3;
28
38
  string content = 4;
39
+ int64 created_at = 5;
29
40
  }
30
41
 
31
42
  message FacebookPostList {
32
43
  repeated FacebookPost posts = 1;
33
44
  }
45
+
46
+ message FacebookImageList {
47
+ repeated FacebookImage images = 1;
48
+ }
data/go.mod CHANGED
@@ -13,6 +13,7 @@ require (
13
13
  github.com/golang/protobuf v1.4.2
14
14
  github.com/google/logger v1.1.0
15
15
  github.com/kennygrant/sanitize v1.2.4 // indirect
16
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254
16
17
  github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
17
18
  github.com/temoto/robotstxt v1.1.1 // indirect
18
19
  github.com/thoas/go-funk v0.7.0
data/go.sum CHANGED
@@ -1,4 +1,6 @@
1
1
  cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2
+ github.com/AlekSi/pointer v1.0.0 h1:KWCWzsvFxNLcmM5XmiqHsGTTsuwZMsLFwWF9Y+//bNE=
3
+ github.com/AlekSi/pointer v1.0.0/go.mod h1:1kjywbfcPFCmncIxtk6fIEub6LKrfMz3gc5QKVOSOA8=
2
4
  github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
3
5
  github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
4
6
  github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
@@ -18,6 +20,7 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
18
20
  github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
19
21
  github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
20
22
  github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
23
+ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
21
24
  github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
22
25
  github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
23
26
  github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
@@ -50,6 +53,10 @@ github.com/google/logger v1.1.0 h1:saB74Etb4EAJNH3z74CVbCKk75hld/8T0CsXKetWCwM=
50
53
  github.com/google/logger v1.1.0/go.mod h1:w7O8nrRr0xufejBlQMI83MXqRusvREoJdaAxV+CoAB4=
51
54
  github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
52
55
  github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
56
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254 h1:JYoQR67E1vv1WGoeW8DkdFs7vrIEe/5wP+qJItd5tUE=
57
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254/go.mod h1:DPucAeQGDPUzYUt+NaWw6qsF5SFapWWToxEiVDh2aV0=
58
+ github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
59
+ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
53
60
  github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
54
61
  github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
55
62
  github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -1,16 +1,4 @@
1
- require 'ffi'
2
- require 'fbcrawl_pb'
3
- module FbcrawlColly
4
- extend FFI::Library
5
1
 
6
- ffi_lib File.expand_path("../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
7
- attach_function :free, [ :pointer ], :void
2
+ module FbcrawlColly
8
3
 
9
- attach_function :Init, [], :pointer
10
- attach_function :Login, [:pointer, :string, :string], :void
11
- attach_function :FetchGroupFeed, [:pointer, :string], :string
12
- attach_function :FetchPost, [:pointer, :string, :string], :string
13
- # attach_function :FetchGroup, [:pointer, :string], :pointer
14
- # attach_function :Login, [:pointer, :string, :string, :string], :void
15
- # attach_function :FreePointer, [:pointer], :void
16
4
  end
@@ -0,0 +1,50 @@
1
+ require 'ffi'
2
+ require_relative '../fbcrawl_pb'
3
+ require_relative './ffi'
4
+
5
module FbcrawlColly
  # Ruby-facing wrapper around the native crawler exposed by FbcrawlColly::FFI.
  #
  # Each instance owns one native crawler handle. The fetch_* methods return
  # decoded protobuf messages; login returns the string produced by the
  # native Login call.
  class Colly
    def initialize
      super
      # AutoPointer hands the handle to FreeColly once this object is
      # garbage collected.
      @colly = ::FFI::AutoPointer.new(FbcrawlColly::FFI::Init(), FbcrawlColly::FFI.method(:FreeColly))
    end

    # Logs in with the given credentials and returns the native result string
    # (nil when the native call returns a NULL string).
    def login(email, password)
      with_native_string(FbcrawlColly::FFI.Login(@colly, email, password)) { |s| s }
    end

    # Installs a previously obtained cookie string on the native crawler.
    def login_with_cookies(cookies)
      FbcrawlColly::FFI.LoginWithCookies(@colly, cookies)
    end

    # Returns a FacebookPostList for the group's feed.
    def fetch_group_feed(group_id)
      with_native_string(FbcrawlColly::FFI.FetchGroupFeed(@colly, group_id)) do |s|
        FbcrawlColly::FacebookPostList.decode(s)
      end
    end

    # Returns the FacebookPost identified by group_id and post_id.
    def fetch_post(group_id, post_id)
      with_native_string(FbcrawlColly::FFI.FetchPost(@colly, group_id, post_id)) do |s|
        FbcrawlColly::FacebookPost.decode(s)
      end
    end

    # Returns a FacebookImageList for the images attached to the post.
    def fetch_content_images(post_id)
      with_native_string(FbcrawlColly::FFI.FetchContentImages(@colly, post_id)) do |s|
        FbcrawlColly::FacebookImageList.decode(s)
      end
    end

    # Returns a FacebookImage for the given image id.
    def fetch_image_url(image_id)
      with_native_string(FbcrawlColly::FFI.FetchImageUrl(@colly, image_id)) do |s|
        FbcrawlColly::FacebookImage.decode(s)
      end
    end

    private

    # Yields the string half of a [string, pointer] native result and frees
    # the pointer afterwards, even when the block raises (e.g. on a decode
    # error), so the native buffer is not leaked.
    #
    # NOTE(review): the bindings declare these results as :strptr, which reads
    # a NUL-terminated C string; a serialized protobuf may contain NUL bytes
    # and would then be truncated — confirm.
    def with_native_string(result)
      s, ptr = result
      yield s
    ensure
      FbcrawlColly::FFI.free(ptr) if ptr
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'ffi'
2
# Raw FFI bindings to the functions exported by the fbcolly shared library.
#
# Functions declared with :strptr return a [String, FFI::Pointer] pair; the
# pointer must be released with `free` by the caller.
module FbcrawlColly::FFI
  # The leading :: makes explicit that this is the ffi gem's namespace, not
  # this module.
  extend ::FFI::Library

  ffi_lib File.expand_path("../../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
  attach_function :free, [ :pointer ], :void

  attach_function :Init, [], :pointer
  # The exported FreeColly has no return value, so it is bound as :void
  # rather than :pointer.
  attach_function :FreeColly, [:pointer], :void
  attach_function :Login, [:pointer, :string, :string], :strptr
  attach_function :LoginWithCookies, [:pointer, :string], :void
  attach_function :FetchGroupFeed, [:pointer, :int64], :strptr
  attach_function :FetchPost, [:pointer, :int64, :int64], :strptr
  attach_function :FetchContentImages, [:pointer, :int64], :strptr
  attach_function :FetchImageUrl, [:pointer, :int64], :strptr
  # attach_function :FetchGroup, [:pointer, :string], :pointer
end
@@ -1,3 +1,3 @@
1
1
  module FbcrawlColly
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/main.go CHANGED
@@ -27,36 +27,69 @@ var password = flag.String("password", "change_me", "facebook password")
27
27
  var otp = flag.String("otp", "123456", "facebook otp")
28
28
  var groupId = flag.String("groupId", "334294967318328", "facebook group id, default is 334294967318328")
29
29
 
30
- var tmp = fbcolly.New()
30
+ var allInstances = map[uintptr]*fbcolly.Fbcolly{}
31
31
 
32
32
  //export Init
33
33
  func Init() uintptr {
34
- return (uintptr)(unsafe.Pointer(tmp))
34
+ instance := fbcolly.New()
35
+ ptr := (uintptr)(unsafe.Pointer(instance))
36
+ allInstances[ptr] = instance
37
+ return ptr
38
+ }
39
+
40
+ //export FreeColly
41
+ func FreeColly(pointer unsafe.Pointer) {
42
+ delete(allInstances, uintptr(pointer))
35
43
  }
36
44
 
37
45
  //export Login
38
- func Login(pointer unsafe.Pointer, email *C.char, password *C.char) {
46
+ func Login(pointer unsafe.Pointer, email *C.char, password *C.char) *C.char {
47
+ p := (*fbcolly.Fbcolly)(pointer)
48
+ cookies, err := p.Login(C.GoString(email), C.GoString(password), "")
49
+ if err == nil {
50
+ return C.CString(cookies)
51
+ }
52
+ return nil
53
+ }
54
+
55
+ //export LoginWithCookies
56
+ func LoginWithCookies(pointer unsafe.Pointer, cookies *C.char) {
39
57
  p := (*fbcolly.Fbcolly)(pointer)
40
- //print(p.E)
41
- p.Login(C.GoString(email), C.GoString(password), "")
58
+ p.LoginWithCookies(C.GoString(cookies))
42
59
  }
43
60
 
44
61
  //export FetchGroupFeed
45
- func FetchGroupFeed(pointer unsafe.Pointer, groupId *C.char) unsafe.Pointer {
62
+ func FetchGroupFeed(pointer unsafe.Pointer, groupId int64) unsafe.Pointer {
46
63
  p := (*fbcolly.Fbcolly)(pointer)
47
- _, postsList := p.FetchGroupFeed(C.GoString(groupId))
64
+ _, postsList := p.FetchGroupFeed(groupId)
48
65
  marshaledPostsList, _ := proto.Marshal(postsList)
49
66
  return C.CBytes(append(marshaledPostsList, 0))
50
67
  }
51
68
 
52
69
  //export FetchPost
53
- func FetchPost(pointer unsafe.Pointer, groupId *C.char, postId *C.char) unsafe.Pointer {
70
+ func FetchPost(pointer unsafe.Pointer, groupId int64, postId int64) unsafe.Pointer {
54
71
  p := (*fbcolly.Fbcolly)(pointer)
55
- _, post := p.FetchPost(C.GoString(groupId), C.GoString(postId))
72
+ _, post := p.FetchPost(groupId, postId)
56
73
  marshaledPost, _ := proto.Marshal(post)
57
74
  return C.CBytes(append(marshaledPost, 0))
58
75
  }
59
76
 
77
+ //export FetchContentImages
78
+ func FetchContentImages(pointer unsafe.Pointer, postId int64) unsafe.Pointer {
79
+ p := (*fbcolly.Fbcolly)(pointer)
80
+ _, imageList := p.FetchContentImages(postId)
81
+ marshaled, _ := proto.Marshal(imageList)
82
+ return C.CBytes(append(marshaled, 0))
83
+ }
84
+
85
+ //export FetchImageUrl
86
+ func FetchImageUrl(pointer unsafe.Pointer, imageId int64) unsafe.Pointer {
87
+ p := (*fbcolly.Fbcolly)(pointer)
88
+ _, image := p.FetchImageUrl(imageId)
89
+ marshaled, _ := proto.Marshal(image)
90
+ return C.CBytes(append(marshaled, 0))
91
+ }
92
+
60
93
  func main() {
61
94
  //r := regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch()[1]
62
95
  //print(r.FindStringSubmatch("/liem.phamthanh.161?refid=18&__tn__=R")[1])
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fbcrawl-colly
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duy Le
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-05 00:00:00.000000000 Z
11
+ date: 2020-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: Crawl mbasic.facebook.com using GO Colly
42
56
  email:
43
57
  - duyleekun@gmail.com
@@ -47,9 +61,9 @@ extensions:
47
61
  extra_rdoc_files: []
48
62
  files:
49
63
  - ".gitignore"
50
- - ".travis.yml"
51
64
  - CODE_OF_CONDUCT.md
52
65
  - Gemfile
66
+ - Gemfile.lock
53
67
  - LICENSE.txt
54
68
  - README.md
55
69
  - Rakefile
@@ -64,7 +78,9 @@ files:
64
78
  - go.mod
65
79
  - go.sum
66
80
  - lib/fbcrawl-colly.rb
67
- - lib/fbcrawl-colly/version.rb
81
+ - lib/fbcrawl_colly/colly.rb
82
+ - lib/fbcrawl_colly/ffi.rb
83
+ - lib/fbcrawl_colly/version.rb
68
84
  - main.go
69
85
  homepage: http://github.com/duyleekun/fbcrawl-colly
70
86
  licenses:
@@ -1,6 +0,0 @@
1
- ---
2
- language: ruby
3
- cache: bundler
4
- rvm:
5
- - 2.7.1
6
- before_install: gem install bundler -v 2.1.4